diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java index 339ee89e9e482130aa26c93cd1904f72d3026460..fbae39bb67e433a2983a77dc7f7326ed87897a0e 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java @@ -37,13 +37,11 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit */ package com.amd.aparapi; -import java.util.logging.Handler; -import java.util.logging.Level; -import java.util.logging.Logger; +import com.amd.aparapi.internal.instruction.*; +import com.amd.aparapi.internal.jni.*; +import com.amd.aparapi.internal.tool.*; -import com.amd.aparapi.internal.instruction.Instruction; -import com.amd.aparapi.internal.jni.ConfigJNI; -import com.amd.aparapi.internal.tool.InstructionViewer; +import java.util.logging.*; /** * A central location for holding all runtime configurable properties as well as logging configuration. @@ -99,6 +97,14 @@ public class Config extends ConfigJNI{ * */ public static final boolean enableShowGeneratedOpenCL = Boolean.getBoolean(propPkgName + ".enableShowGeneratedOpenCL"); + + /** + * Upon exiting the JVM, dumps kernel profiling info to standard out. + * + * Usage -Dcom.amd.aparapi.dumpProfilesOnExit={true|false} + * + */ + public static final boolean dumpProfilesOnExit = Boolean.getBoolean(propPkgName + ".dumpProfilesOnExit"); // Pragma/OpenCL codegen related flags public static final boolean enableAtomic32 = Boolean.getBoolean(propPkgName + ".enableAtomic32"); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java index 708005ccde41337bd9e23bf26fb84043a632e2db..8bead23faddde914beb74a491a53ad23d1d03864 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java @@ -38,8 +38,9 @@ under those regulations, please refer to the U.S. 
Bureau of Industry and Securit package com.amd.aparapi; import com.amd.aparapi.annotation.Experimental; +import com.amd.aparapi.device.*; import com.amd.aparapi.exception.DeprecatedException; -import com.amd.aparapi.internal.kernel.KernelRunner; +import com.amd.aparapi.internal.kernel.*; import com.amd.aparapi.internal.model.CacheEnabler; import com.amd.aparapi.internal.model.ClassModel.ConstantPool.MethodReferenceEntry; import com.amd.aparapi.internal.model.ClassModel.ConstantPool.NameAndTypeEntry; @@ -47,7 +48,7 @@ import com.amd.aparapi.internal.model.ValueCache; import com.amd.aparapi.internal.model.ValueCache.ThrowingValueComputer; import com.amd.aparapi.internal.model.ValueCache.ValueComputer; import com.amd.aparapi.internal.opencl.OpenCLLoader; -import com.amd.aparapi.internal.util.UnsafeWrapper; +import com.amd.aparapi.internal.util.*; import java.lang.annotation.Annotation; import java.lang.annotation.ElementType; @@ -55,14 +56,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; import java.lang.reflect.Method; -import java.util.ArrayDeque; -import java.util.Arrays; -import java.util.Deque; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.BrokenBarrierException; import java.util.concurrent.CyclicBarrier; import java.util.logging.Logger; @@ -314,7 +308,13 @@ public abstract class Kernel implements Cloneable { } /** - * The <i>execution mode</i> ENUM enumerates the possible modes of executing a kernel. + * @deprecated It is no longer recommended that {@code EXECUTION_MODE}s are used, as a more sophisticated {@link com.amd.aparapi.device.Device} + * preference mechanism is in place, see {@link com.amd.aparapi.internal.kernel.KernelManager}. 
Though {@link #setExecutionMode(EXECUTION_MODE)} + * is still honored, the default EXECUTION_MODE is now {@link EXECUTION_MODE#AUTO}, which indicates that the KernelManager + * will determine execution behaviours. + * + * <p> + * The <i>execution mode</i> ENUM enumerates the possible modes of executing a kernel. * One can request a mode of execution using the values below, and query a kernel after it first executes to * determine how it executed. * @@ -354,8 +354,12 @@ public abstract class Kernel implements Cloneable { * @author gfrost AMD Javalabs * @version Alpha, 21/09/2010 */ - + @Deprecated public static enum EXECUTION_MODE { + /** + * + */ + AUTO, /** * A dummy value to indicate an unknown state. */ @@ -389,27 +393,9 @@ public abstract class Kernel implements Cloneable { */ ACC; - static EXECUTION_MODE getDefaultExecutionMode() { - EXECUTION_MODE defaultExecutionMode = OpenCLLoader.isOpenCLAvailable() ? GPU : JTP; - final String executionMode = Config.executionMode; - if (executionMode != null) { - try { - EXECUTION_MODE requestedExecutionMode; - requestedExecutionMode = getExecutionModeFromString(executionMode).iterator().next(); - logger.fine("requested execution mode ="); - if ((OpenCLLoader.isOpenCLAvailable() && requestedExecutionMode.isOpenCL()) || !requestedExecutionMode.isOpenCL()) { - defaultExecutionMode = requestedExecutionMode; - } - } catch (final Throwable t) { - // we will take the default - } - } - - logger.fine("default execution modes = " + defaultExecutionMode); - - return (defaultExecutionMode); - } - + /** + * @deprecated See {@link EXECUTION_MODE}. + */ static LinkedHashSet<EXECUTION_MODE> getDefaultExecutionModes() { LinkedHashSet<EXECUTION_MODE> defaultExecutionModes = new LinkedHashSet<EXECUTION_MODE>(); @@ -956,6 +942,26 @@ public abstract class Kernel implements Cloneable { */ public abstract void run(); + /** False by default. 
In the event that all preferred devices fail to execute a kernel, it is possible to supply an alternate (possibly non-parallel) + * execution algorithm by overriding this method to return true, and overriding {@link #executeFallbackAlgorithm(Range, int)} with the alternate + * algorithm. + */ + public boolean hasFallbackAlgorithm() { + return false; + } + + /** If {@link #hasFallbackAlgorithm()} has been overridden to return true, this method should be overridden so as to + * apply a single pass of the kernel's logic to the entire _range. + * + * <p> + * This is not normally required, as fallback to {@link JavaDevice#THREAD_POOL} will implement the algorithm in parallel. However + * in the event that thread pool execution may be prohibitively slow, this method might implement a "quick and dirty" approximation + * to the desired result (for example, a simple box-blur as opposed to a gaussian blur in an image processing application). + */ + public void executeFallbackAlgorithm(Range _range, int _passId) { + // nothing + } + /** * Invoking this method flags that once the current pass is complete execution should be abandoned. Due to the complexity of intercommunication * between java (or C) and executing OpenCL, this is the best we can do for general cancellation of execution at present. OpenCL 2.0 should introduce @@ -1930,26 +1936,29 @@ public abstract class Kernel implements Cloneable { return kernelState; } + private KernelRunner prepareKernelRunner() { + if (kernelRunner == null) { + kernelRunner = new KernelRunner(this); + } + return kernelRunner; + } + /** * Determine the execution time of the previous Kernel.execute(range) call. - * - * Note that for the first call this will include the conversion time. - * - * @return The time spent executing the kernel (ms) - * + * + * Note that for the first call this will include the conversion time. 
+ * + * @return The time spent executing the kernel (ms) + * * @see #getConversionTime(); * @see #getAccumulatedExecutionTime(); - * + * */ - public synchronized long getExecutionTime() { - return prepareKernelRunner().getExecutionTime(); - } - - private KernelRunner prepareKernelRunner() { - if (kernelRunner == null) { - kernelRunner = new KernelRunner(this); + public double getExecutionTime() { + KernelProfile profile = KernelManager.instance().getProfile(getClass()); + synchronized (profile) { + return profile.getLastExecutionTime(); } - return kernelRunner; } /** @@ -1963,8 +1972,11 @@ public abstract class Kernel implements Cloneable { * @see #getConversionTime(); * */ - public synchronized long getAccumulatedExecutionTime() { - return prepareKernelRunner().getAccumulatedExecutionTime(); + public double getAccumulatedExecutionTime() { + KernelProfile profile = KernelManager.instance().getProfile(getClass()); + synchronized (profile) { + return profile.getAccumulatedTotalTime(); + } } /** @@ -1974,8 +1986,11 @@ public abstract class Kernel implements Cloneable { * @see #getExecutionTime(); * @see #getAccumulatedExecutionTime(); */ - public synchronized long getConversionTime() { - return prepareKernelRunner().getConversionTime(); + public double getConversionTime() { + KernelProfile profile = KernelManager.instance().getProfile(getClass()); + synchronized (profile) { + return profile.getLastConversionTime(); + } } /** @@ -1992,10 +2007,30 @@ public abstract class Kernel implements Cloneable { return (execute(_range, 1)); } + @Override + @SuppressWarnings("deprecation") + public String toString() { + if (executionMode == EXECUTION_MODE.AUTO) { + List<Device> preferredDevices = KernelManager.instance().getPreferences(this).getPreferredDevices(this); + StringBuilder preferredDevicesSummary = new StringBuilder("{"); + for (int i = 0; i < preferredDevices.size(); ++i) { + Device device = preferredDevices.get(i); + 
preferredDevicesSummary.append(device.getShortDescription()); + if (i < preferredDevices.size() - 1) { + preferredDevicesSummary.append("|"); + } + } + preferredDevicesSummary.append("}"); + return Reflection.getSimpleName(getClass()) + ", devices=" + preferredDevicesSummary.toString(); + } else { + return Reflection.getSimpleName(getClass()) + ", modes=" + executionModes + ", current = " + executionMode; + } + } + /** * Start execution of <code>_range</code> kernels. * <p> - * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then + * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. * <p> * Since adding the new <code>Range class</code> this method offers backward compatibility and merely defers to <code> return (execute(Range.create(_range), 1));</code>. @@ -2004,7 +2039,18 @@ public abstract class Kernel implements Cloneable { * */ public synchronized Kernel execute(int _range) { - return (execute(Range.create(_range), 1)); + return (execute(createRange(_range), 1)); + } + + @SuppressWarnings("deprecation") + protected Range createRange(int _range) { + if (executionMode.equals(EXECUTION_MODE.AUTO)) { + Device device = getTargetDevice(); + Range range = Range.create(device, _range); + return range; + } else { + return Range.create(null, _range); + } } /** @@ -2033,21 +2079,7 @@ public abstract class Kernel implements Cloneable { * */ public synchronized Kernel execute(int _range, int _passes) { - return (execute(Range.create(_range), _passes)); - } - - /** - * Start execution of <code>globalSize</code> kernels for the given entrypoint. 
- * <p> - * When <code>kernel.execute("entrypoint", globalSize)</code> is invoked, Aparapi will schedule the execution of <code>globalSize</code> kernels. If the execution mode is GPU then - * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. - * <p> - * @param _entry is the name of the method we wish to use as the entrypoint to the kernel - * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr) - * - */ - public synchronized Kernel execute(Entry _entry, Range _range) { - return prepareKernelRunner().execute(_entry, _range, 1); + return (execute(createRange(_range), _passes)); } /** @@ -2093,7 +2125,22 @@ public abstract class Kernel implements Cloneable { } } + public boolean isRunningCL() { + return getTargetDevice() instanceof OpenCLDevice; + } + + public final Device getTargetDevice() { + return KernelManager.instance().getPreferences(this).getPreferredDevice(this); + } + + /** @return true by default, may be overridden to allow vetoing of a device or devices by a given Kernel instance. */ + public boolean isAllowDevice(Device _device) { + return true; + } + /** + * @deprecated See {@link EXECUTION_MODE} + * <p> * Return the current execution mode. * * Before a Kernel executes, this return value will be the execution mode as determined by the setting of @@ -2108,11 +2155,14 @@ public abstract class Kernel implements Cloneable { * * @see #setExecutionMode(EXECUTION_MODE) */ + @Deprecated public EXECUTION_MODE getExecutionMode() { return (executionMode); } /** + * @deprecated See {@link EXECUTION_MODE} + * <p> * Set the execution mode. * <p> * This should be regarded as a request. The real mode will be determined at runtime based on the availability of OpenCL and the characteristics of the workload. 
@@ -2121,10 +2171,15 @@ public abstract class Kernel implements Cloneable { * * @see #getExecutionMode() */ + @Deprecated public void setExecutionMode(EXECUTION_MODE _executionMode) { executionMode = _executionMode; } + /** + * @deprecated See {@link EXECUTION_MODE} + */ + @Deprecated public void setFallbackExecutionMode() { executionMode = EXECUTION_MODE.getFallbackExecutionMode(); } @@ -2718,13 +2773,24 @@ public abstract class Kernel implements Cloneable { return prepareKernelRunner().getProfileInfo(); } - private final LinkedHashSet<EXECUTION_MODE> executionModes = EXECUTION_MODE.getDefaultExecutionModes(); + /** + * @deprecated See {@link EXECUTION_MODE}. + */ + private final LinkedHashSet<EXECUTION_MODE> executionModes = (Config.executionMode != null) ? EXECUTION_MODE.getDefaultExecutionModes() : new LinkedHashSet<>(Collections.singleton(EXECUTION_MODE.AUTO)); + /** + * @deprecated See {@link EXECUTION_MODE}. + */ private Iterator<EXECUTION_MODE> currentMode = executionModes.iterator(); + /** + * @deprecated See {@link EXECUTION_MODE}. + */ private EXECUTION_MODE executionMode = currentMode.next(); /** + * @deprecated See {@link EXECUTION_MODE}. + * <p> * set possible fallback path for execution modes. * for example setExecutionFallbackPath(GPU,CPU,JTP) will try to use the GPU * if it fails it will fall back to OpenCL CPU and finally it will try JTP. @@ -2736,6 +2802,7 @@ public abstract class Kernel implements Cloneable { } /** + * @deprecated See {@link EXECUTION_MODE}. * @return is there another execution path we can try */ public boolean hasNextExecutionMode() { @@ -2743,6 +2810,7 @@ public abstract class Kernel implements Cloneable { } /** + * @deprecated See {@link EXECUTION_MODE}. 
* try the next execution path in the list if there aren't any more than give up */ public void tryNextExecutionMode() { diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java index 5fb435a46b0535215833f1dea888fa4934db17bd..75db2c245b680a1e4f4b9d134a07f048292755d6 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java @@ -1,9 +1,9 @@ package com.amd.aparapi; -import java.util.Arrays; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.jni.*; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.internal.jni.RangeJNI; +import java.util.*; /** * @@ -56,7 +56,7 @@ public class Range extends RangeJNI{ public static final int MAX_GROUP_SIZE = Math.max(Runtime.getRuntime().availableProcessors() * THREADS_PER_CORE, MAX_OPENCL_GROUP_SIZE); - private Device device = null; + private OpenCLDevice device = null; private int maxWorkGroupSize; @@ -73,7 +73,7 @@ public class Range extends RangeJNI{ * @param _dims */ public Range(Device _device, int _dims) { - device = _device; + device = !(_device instanceof OpenCLDevice) ? null : (OpenCLDevice) _device; dims = _dims; if (device != null) { @@ -317,7 +317,7 @@ public class Range extends RangeJNI{ * For example for <code>MAX_GROUP_SIZE</code> of 64 we favor 4x4x4 over 1x16x16. 
* * @param _globalWidth the width of the 3D grid we wish to process - * @param _globalHieght the height of the 3D grid we wish to process + * @param _globalHeight the height of the 3D grid we wish to process * @param _globalDepth the depth of the 3D grid we wish to process * @return */ diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java index a4bfcdeb9d6411ce52e9593e41d2fd9f3294a9eb..c3790880b8278ac5689b02e0da67fcb1b934e1e1 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java @@ -1,79 +1,76 @@ package com.amd.aparapi.device; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.OpenCLDevice.DeviceComparitor; -import com.amd.aparapi.device.OpenCLDevice.DeviceSelector; +import com.amd.aparapi.*; +import com.amd.aparapi.internal.kernel.*; public abstract class Device{ public static enum TYPE { - UNKNOWN, - GPU, - CPU, - JTP, - SEQ, - ACC + UNKNOWN(Integer.MAX_VALUE), + GPU(2), + CPU(3), + JTP(5), + SEQ(6), + ACC(1), + ALT(4); + + /** Heuristic ranking of device types, lower is better. 
*/ + public final int rank; + + TYPE(int rank) { + this.rank = rank; + } }; - /** - * @return Now return the device of any types having the maximum compute units + /** @deprecated use {@link KernelManager#bestDevice()} + * @see com.amd.aparapi.device */ + @Deprecated public static Device best() { - return (OpenCLDevice.select(new DeviceComparitor(){ - @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) { - if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) { - return (_deviceLhs); - } else { - return (_deviceRhs); - } - } - })); + return KernelManager.instance().bestDevice(); } + /** + * @see com.amd.aparapi.device + */ + @SuppressWarnings("deprecation") + @Deprecated public static Device bestGPU() { - return (OpenCLDevice.select(new DeviceComparitor(){ - @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) { - if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) { - return (_deviceLhs); - } else { - return (_deviceRhs); - } - } - }, Device.TYPE.GPU)); - } - - public static Device bestACC() { - return (OpenCLDevice.select(new DeviceComparitor(){ - @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) { - if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) { - return (_deviceLhs); - } else { - return (_deviceRhs); - } - } - }, Device.TYPE.ACC)); + return firstGPU(); } + /** + * @see com.amd.aparapi.device + */ + @Deprecated public static Device first(final Device.TYPE _type) { - return (OpenCLDevice.select(new DeviceSelector(){ - @Override public OpenCLDevice select(OpenCLDevice _device) { - return (_device.getType() == _type ? 
_device : null); - } - })); + return KernelManager.DeprecatedMethods.firstDevice(_type); } + /** + * @see com.amd.aparapi.device + */ + @SuppressWarnings("deprecation") + @Deprecated public static Device firstGPU() { - return (first(Device.TYPE.GPU)); + return KernelManager.DeprecatedMethods.firstDevice(TYPE.GPU); } + /** + * @see com.amd.aparapi.device + */ + @SuppressWarnings("deprecation") + @Deprecated public static Device firstCPU() { - return (first(Device.TYPE.CPU)); - + return KernelManager.DeprecatedMethods.firstDevice(TYPE.CPU); } - public static Device firstACC() { - return (first(Device.TYPE.ACC)); - + /** + * @see com.amd.aparapi.device + */ + @Deprecated + public static Device bestACC() { + throw new UnsupportedOperationException(); } protected TYPE type = TYPE.UNKNOWN; @@ -88,6 +85,8 @@ public abstract class Device{ 0 }; + public abstract String getShortDescription(); + public TYPE getType() { return type; } @@ -144,4 +143,25 @@ public abstract class Device{ int _localDepth) { return (Range.create3D(this, _globalWidth, _globalHeight, _globalDepth, _localWidth, _localHeight, _localDepth)); } + + public abstract long getDeviceId(); + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof Device)) { + return false; + } + + Device device = (Device) o; + + return getDeviceId() == device.getDeviceId(); + } + + @Override + public int hashCode() { + return Long.hashCode(getDeviceId()); + } } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java index 78082e77d74d0512dd91df791b130b7beec75bf8..33f5cd4d22e02c6b7f31dc731995e2f906c5fda6 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java @@ -1,5 +1,32 @@ package com.amd.aparapi.device; -public class JavaDevice extends Device{ +public class JavaDevice extends Device { + public 
static final JavaDevice THREAD_POOL = new JavaDevice(TYPE.JTP, "Java Thread Pool", -3); + public static final JavaDevice ALTERNATIVE_ALGORITHM = new JavaDevice(TYPE.ALT, "Java Alternative Algorithm", -2); + public static final JavaDevice SEQUENTIAL = new JavaDevice(TYPE.SEQ, "Java Sequential", -1); + + private final String name; + private final long deviceId; + + private JavaDevice(TYPE _type, String _name, long deviceId) { + this.deviceId = deviceId; + this.type = _type; + this.name = _name; + } + + @Override + public String getShortDescription() { + return name; + } + + @Override + public long getDeviceId() { + return deviceId; + } + + @Override + public String toString() { + return getShortDescription(); + } } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java index 61bfe548a2b292191f91de30bc77f74a70a3b615..ce196121488778aaee505202071d933a181d46c8 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java @@ -1,34 +1,15 @@ package com.amd.aparapi.device; -import com.amd.aparapi.ProfileInfo; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.lang.annotation.Annotation; -import java.lang.reflect.InvocationHandler; -import java.lang.reflect.Method; -import java.lang.reflect.Proxy; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import com.amd.aparapi.Range; -import com.amd.aparapi.internal.opencl.OpenCLArgDescriptor; -import com.amd.aparapi.internal.opencl.OpenCLKernel; -import com.amd.aparapi.internal.opencl.OpenCLPlatform; -import com.amd.aparapi.internal.opencl.OpenCLProgram; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Arg; -import com.amd.aparapi.opencl.OpenCL.Constant; -import 
com.amd.aparapi.opencl.OpenCL.GlobalReadOnly; -import com.amd.aparapi.opencl.OpenCL.GlobalReadWrite; -import com.amd.aparapi.opencl.OpenCL.GlobalWriteOnly; +import com.amd.aparapi.*; +import com.amd.aparapi.internal.opencl.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; import com.amd.aparapi.opencl.OpenCL.Kernel; -import com.amd.aparapi.opencl.OpenCL.Local; -import com.amd.aparapi.opencl.OpenCL.Resource; -import com.amd.aparapi.opencl.OpenCL.Source; + +import java.io.*; +import java.lang.annotation.*; +import java.lang.reflect.*; +import java.util.*; public class OpenCLDevice extends Device{ @@ -44,6 +25,8 @@ public class OpenCLDevice extends Device{ private long maxMemAllocSize; + private String shortDescription = null; + /** * Minimal constructor * @@ -101,6 +84,18 @@ public class OpenCLDevice extends Device{ return (deviceId); } + @Override + public String getShortDescription() { + if (shortDescription == null) { + String vendor = platform.getName(); + // Hopefully(!) this equates to the recognisable name of the vendor, e.g. "Intel", "NVIDIA", "AMD" + // Note, it is not necessarily the hardware vendor, e.g. if the AMD CPU driver (i.e. 
platform) is used for an Intel CPU, this will be "AMD" + String[] split = vendor.split("[\\s\\(\\)]"); // split on whitespace or on '(' or ')' since Intel use "Intel(R)" here + shortDescription = split[0] + "<" + getType() + ">"; + } + return shortDescription; + } + public static class OpenCLInvocationHandler<T extends OpenCL<T>> implements InvocationHandler{ private final Map<String, OpenCLKernel> map; @@ -380,8 +375,6 @@ public class OpenCLDevice extends Device{ } } - // System.out.println("opencl{\n" + _source + "\n}opencl"); - final OpenCLProgram program = new OpenCLProgram(this, _source).createProgram(this); final Map<String, OpenCLKernel> map = new HashMap<String, OpenCLKernel>(); @@ -412,6 +405,22 @@ public class OpenCLDevice extends Device{ OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs); } + /** List OpenCLDevices of a given TYPE, or all OpenCLDevices if type == null. */ + public static List<OpenCLDevice> listDevices(TYPE type) { + final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null); + final ArrayList<OpenCLDevice> results = new ArrayList<>(); + + for (final OpenCLPlatform p : platform.getOpenCLPlatforms()) { + for (final OpenCLDevice device : p.getOpenCLDevices()) { + if (type == null || device.getType() == type) { + results.add(device); + } + } + } + + return results; + } + public static OpenCLDevice select(DeviceSelector _deviceSelector) { OpenCLDevice device = null; final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null); @@ -435,8 +444,10 @@ public class OpenCLDevice extends Device{ OpenCLDevice device = null; final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null); - for (final OpenCLPlatform p : platform.getOpenCLPlatforms()) { - for (final OpenCLDevice d : p.getOpenCLDevices()) { + List<OpenCLPlatform> openCLPlatforms = platform.getOpenCLPlatforms(); + for (final OpenCLPlatform p : openCLPlatforms) { + List<OpenCLDevice> openCLDevices = p.getOpenCLDevices(); + for (final 
OpenCLDevice d : openCLDevices) { if (device == null) { device = d; } else { @@ -466,7 +477,6 @@ public class OpenCLDevice extends Device{ return (device); } - @Override public String toString() { final StringBuilder s = new StringBuilder("{"); boolean first = true; @@ -482,7 +492,8 @@ public class OpenCLDevice extends Device{ s.append("}"); - return ("Device " + deviceId + "\n type:" + type + "\n maxComputeUnits=" + maxComputeUnits + "\n maxWorkItemDimensions=" + return ("Device " + deviceId + "\n vendor = " + getOpenCLPlatform().getVendor() + + "\n type:" + type + "\n maxComputeUnits=" + maxComputeUnits + "\n maxWorkItemDimensions=" + maxWorkItemDimensions + "\n maxWorkItemSizes=" + s + "\n maxWorkWorkGroupSize=" + maxWorkGroupSize + "\n globalMemSize=" + globalMemSize + "\n localMemSize=" + localMemSize); } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java index babe06a6e60af0c368db8ffce5c5bfb85a16fd6b..039f1883909ce1a6b8934baa634a17763d44869f 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java @@ -1,4 +1,19 @@ /** + * Contains classes representing OpenCL-capable devices, and "virtual" (java) devices which execute kernels using java. + * + * <p>Various methods of {@link com.amd.aparapi.device.Device} which selected devices of a particular type have been deprecated, + * as now the preferred mechanism for device selection is to rely on the {@link com.amd.aparapi.internal.kernel.KernelManager} to + * select an appropriate device. 
Where a particular device is required to be used for a certain kernel, for such purposes as + * debugging or unit testing, this can be achieved by using + * {@link com.amd.aparapi.internal.kernel.KernelManager#setKernelManager(com.amd.aparapi.internal.kernel.KernelManager)} prior to + * invoking any Kernel executions, by overriding {@link com.amd.aparapi.Kernel#isAllowDevice(com.amd.aparapi.device.Device)} + * to veto/approve devices from the available devices for a given Kernel class, or (not recommended) by using + * {@link com.amd.aparapi.internal.kernel.KernelManager#setPreferredDevices(com.amd.aparapi.Kernel, java.util.LinkedHashSet)} to specify + * a particular device list for a given Kernel class. + * + * <p>In order to determine the Device which will be used to execute a particular Kernel, use {@link com.amd.aparapi.Kernel#getTargetDevice()}. + * This can also be used immediately after execution to see on which device the kernel actually got executed (in case the execution failed + * and fell back to another device). 
* */ package com.amd.aparapi.device; \ No newline at end of file diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java index 9599bf0469be570c30199d78fc409e3cd76ad823..ce34d6d062e3cad65231c82e84cfc6491efa0ef2 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java @@ -1,11 +1,11 @@ package com.amd.aparapi.internal.kernel; -import java.lang.reflect.Field; -import java.nio.ByteBuffer; +import com.amd.aparapi.internal.jni.*; +import com.amd.aparapi.internal.model.*; +import com.amd.aparapi.internal.util.*; -import com.amd.aparapi.Kernel; -import com.amd.aparapi.internal.jni.KernelArgJNI; -import com.amd.aparapi.internal.model.ClassModel; +import java.lang.reflect.*; +import java.nio.*; /** * Each field (or captured field in the case of an anonymous inner class) referenced by any bytecode reachable from the users Kernel.run(), will @@ -48,7 +48,7 @@ public class KernelArg extends KernelArgJNI{ * Default constructor */ protected KernelArg() { - + // empty } /** @@ -260,4 +260,9 @@ public class KernelArg extends KernelArgJNI{ protected void setDims(int[] dims) { this.dims = dims; } + + @Override + public String toString() { + return Reflection.getSimpleName(field.getType()) + " " + field.getName(); + } } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java new file mode 100644 index 0000000000000000000000000000000000000000..87e221ef9060e348a9126c53d1590ff4b3b2eee4 --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java @@ -0,0 +1,193 @@ +package com.amd.aparapi.internal.kernel; + +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; + +import java.text.*; +import java.util.*; +import 
java.util.logging.*;
+
+/**
+ * Created by Barney on 02/09/2015.
+ *
+ * <p>Holds the profiling data of one {@link Kernel} subclass on one {@link Device}: the timestamp of each
+ * {@link ProfilingEvent} of the current (or most recent) execution, and the per-stage elapsed times accumulated over
+ * all completed executions. Every public accessor reports milliseconds.</p>
+ *
+ * <p>This class performs no synchronization of its own.</p>
+ */
+public class KernelDeviceProfile {
+
+    private static final Logger logger = Logger.getLogger(Config.getLoggerName());
+    /** Nanoseconds per millisecond; divides {@link System#nanoTime()} differences to give milliseconds. */
+    private static final double MILLION = 1000 * 1000;
+    private static final int TABLE_COLUMN_HEADER_WIDTH = 21;
+    private static final int TABLE_COLUMN_COUNT_WIDTH = 8;
+    /** Width of a per-stage table cell: the longest {@link ProfilingEvent} name plus one. */
+    private static final int TABLE_COLUMN_WIDTH;
+    private final Class<? extends Kernel> kernel;
+    private final Device device;
+    /** {@link System#nanoTime()} stamp of each stage of the current execution, indexed by event ordinal. */
+    private long[] currentTimes = new long[ProfilingEvent.values().length];
+    /** Elapsed nanoseconds per stage summed over all completed executions, indexed by event ordinal. */
+    private long[] accumulatedTimes = new long[ProfilingEvent.values().length];
+    /** Most recent event of the execution in progress; {@code null} before the first START and after EXECUTED. */
+    private ProfilingEvent lastEvent = null;
+    private final DecimalFormat format;
+    /** Number of {@link ProfilingEvent#START} events received. */
+    private long invocationCount = 0;
+
+    static {
+        // The arrays above are indexed by ordinal and every loop starts at 1, which relies on START being first.
+        assert ProfilingEvent.START.ordinal() == 0 : "ProfilingEvent.START.ordinal() != 0";
+        int max = 0;
+        for (ProfilingEvent event : ProfilingEvent.values()) {
+            max = Math.max(max, event.name().length());
+        }
+        TABLE_COLUMN_WIDTH = max + 1;
+    }
+
+    /**
+     * @param kernel the kernel class being profiled (only used by {@link #toString()})
+     * @param device the device the kernel class runs on
+     */
+    public KernelDeviceProfile(Class<? extends Kernel> kernel, Device device) {
+        this.kernel = kernel;
+        this.device = device;
+        this.format = (DecimalFormat) DecimalFormat.getNumberInstance();
+        format.setMinimumFractionDigits(3);
+        format.setMaximumFractionDigits(3);
+    }
+
+    /**
+     * Records the current time for {@code event}.
+     *
+     * <p>{@link ProfilingEvent#START} clears the timestamps of the previous execution and increments the invocation
+     * count. {@link ProfilingEvent#EXECUTED} adds the elapsed time of every stage to the accumulated totals and ends
+     * the execution. Out-of-order usage is reported through the logger at {@link Level#SEVERE}; it never throws.</p>
+     *
+     * @param event the stage that has just been reached
+     */
+    public void onEvent(ProfilingEvent event) {
+        if (event == ProfilingEvent.START) {
+            // Test for the duplicate first: lastEvent == START also satisfies lastEvent != null, so with the tests
+            // the other way round the duplicate message could never be logged.
+            if (lastEvent == ProfilingEvent.START) {
+                logger.log(Level.SEVERE, "Duplicate event ProfilingEvent.START");
+            } else if (lastEvent != null) {
+                logger.log(Level.SEVERE, "ProfilingEvent.START encountered without ProfilingEvent.EXECUTED");
+            }
+            Arrays.fill(currentTimes, 0L);
+            ++invocationCount;
+        } else {
+            if (lastEvent == null) {
+                if (event != ProfilingEvent.EXECUTED) {
+                    logger.log(Level.SEVERE, "ProfilingEvent.START was not invoked prior to ProfilingEvent." + event);
+                }
+            } else {
+                // Stages skipped between the previous event and this one inherit the previous timestamp, so each
+                // of them contributes an elapsed time of zero.
+                for (int i = lastEvent.ordinal() + 1; i < event.ordinal(); ++i) {
+                    currentTimes[i] = currentTimes[i - 1];
+                }
+            }
+        }
+        currentTimes[event.ordinal()] = System.nanoTime();
+        if (event == ProfilingEvent.EXECUTED) {
+            for (int i = 1; i < currentTimes.length; ++i) {
+                long elapsed = currentTimes[i] - currentTimes[i - 1];
+                if (elapsed < 0) {
+                    // name the stage whose interval is negative, not the event that triggered the accumulation
+                    logger.log(Level.SEVERE, "negative elapsed time for event " + ProfilingEvent.values()[i]);
+                    break;
+                }
+                accumulatedTimes[i] += elapsed;
+            }
+            lastEvent = null;
+        } else {
+            lastEvent = event;
+        }
+    }
+
+    /** Elapsed time for a single event only, i.e. since the previous stage rather than from the start. */
+    public double getLastElapsedTime(ProfilingEvent stage) {
+        if (stage == ProfilingEvent.START) {
+            return 0;
+        }
+        return (currentTimes[stage.ordinal()] - currentTimes[stage.ordinal() - 1]) / MILLION;
+    }
+
+    /** Elapsed time for all events {@code from} through {@code to}.*/
+    public double getLastElapsedTime(ProfilingEvent from, ProfilingEvent to) {
+        return (currentTimes[to.ordinal()] - currentTimes[from.ordinal()]) / MILLION;
+    }
+
+    /** Elapsed time for a single event only, i.e. since the previous stage rather than from the start, summed over all executions. */
+    public double getCumulativeElapsedTime(ProfilingEvent stage) {
+        return (accumulatedTimes[stage.ordinal()]) / MILLION;
+    }
+
+    /** Elapsed time of entire execution, summed over all executions, in milliseconds like every other accessor. */
+    public double getCumulativeElapsedTimeAll() {
+        double sum = 0;
+        for (int i = 1; i <= ProfilingEvent.EXECUTED.ordinal(); ++i) {
+            sum += accumulatedTimes[i];
+        }
+        // accumulatedTimes holds nanoseconds; convert so the unit agrees with getCumulativeElapsedTime(stage)
+        return sum / MILLION;
+    }
+
+    /** Header line matching the rows produced by the {@code get...AsTableRow()} methods. */
+    public static String getTableHeader() {
+        int length = ProfilingEvent.values().length;
+        StringBuilder builder = new StringBuilder(150);
+        appendRowHeaders(builder, "Device", "Count");
+        for (int i = 1; i < length; ++i) {
+            ProfilingEvent stage = ProfilingEvent.values()[i];
+            String heading = stage.name();
+            appendCell(builder, heading);
+        }
+        builder.append(" ").append("Total");
+        return builder.toString();
+    }
+
+    /** Table row with the per-stage elapsed times of the current (or most recent) execution and their total. */
+    public String getLastAsTableRow() {
+        double total = 0;
+        StringBuilder builder = new StringBuilder(150);
+        appendRowHeaders(builder, device.getShortDescription(), String.valueOf(invocationCount));
+        for (int i = 1; i < currentTimes.length; ++i) {
+            ProfilingEvent stage = ProfilingEvent.values()[i];
+            double time = getLastElapsedTime(stage);
+            total += time;
+            String formatted = format.format(time);
+            appendCell(builder, formatted);
+        }
+        builder.append(" ").append(format.format(total));
+        return builder.toString();
+    }
+
+    /** Table row with the per-stage elapsed times summed over all executions. */
+    public String getCumulativeAsTableRow() {
+        return internalCumulativeAsTableRow(false);
+    }
+
+    /** Table row with the per-stage elapsed times averaged over the invocation count. */
+    public String getAverageAsTableRow() {
+        return internalCumulativeAsTableRow(true);
+    }
+
+    /**
+     * Builds a row from the accumulated times.
+     *
+     * @param mean {@code true} to divide each accumulated time by the invocation count, {@code false} for plain sums
+     */
+    private String internalCumulativeAsTableRow(boolean mean) {
+        double total = 0;
+        // never divide by zero: with no invocations the accumulated times are all zero anyway
+        double count = mean ? Math.max(1, invocationCount) : 1;
+        StringBuilder builder = new StringBuilder(150);
+        appendRowHeaders(builder, device.getShortDescription(), String.valueOf(invocationCount));
+        for (int i = 1; i < currentTimes.length; ++i) {
+            ProfilingEvent stage = ProfilingEvent.values()[i];
+            double time = getCumulativeElapsedTime(stage);
+            if (mean) {
+                time /= count;
+            }
+            total += time;
+            String formatted = format.format(time);
+            appendCell(builder, formatted);
+        }
+        builder.append(" ").append(format.format(total));
+        return builder.toString();
+    }
+
+    /** Appends the left-aligned device and count columns; an over-long device description is truncated. */
+    private static void appendRowHeaders(StringBuilder builder, String device, String count) {
+        if (device.length() > TABLE_COLUMN_HEADER_WIDTH - 1) {
+            device = device.substring(0, TABLE_COLUMN_HEADER_WIDTH - 1);
+        }
+        builder.append(device);
+        int padding = TABLE_COLUMN_HEADER_WIDTH - device.length();
+        for (int i = 0; i < padding; ++i) {
+            builder.append(' ');
+        }
+
+        builder.append(count);
+        padding = TABLE_COLUMN_COUNT_WIDTH - count.length();
+        for (int i = 0; i < padding; ++i) {
+            builder.append(' ');
+        }
+    }
+
+    /** Appends {@code cell} right-aligned in a column of {@code TABLE_COLUMN_WIDTH} characters. */
+    private static void appendCell(StringBuilder builder, String cell) {
+        int padding = TABLE_COLUMN_WIDTH - cell.length();
+        for (int paddingIndex = 0; paddingIndex < padding; ++paddingIndex) {
+            builder.append(' ');
+        }
+        builder.append(cell);
+    }
+
+    @Override
+    public String toString() {
+        return "KernelDeviceProfile{" + kernel.toString() + ", " + device.getShortDescription() + "}";
+    }
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java
new file mode 100644
index 0000000000000000000000000000000000000000..c1f29cbc51c95ed04011346b978f5a1bfd635293
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java
@@ -0,0 +1,300 @@
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.util.*;
+
+import
java.lang.reflect.*;
+import java.util.*;
+
+/**
+ * Created by Barney on 24/08/2015.
+ *
+ * <p>Holds, per {@link Kernel} subclass, the device preferences, the profiling data and an optional shared kernel
+ * instance, together with the default device preferences. A process-wide instance is available through
+ * {@link #instance()} and can be replaced with {@link #setKernelManager(KernelManager)}.</p>
+ */
+public class KernelManager {
+
+    private static KernelManager INSTANCE = new KernelManager();
+    private LinkedHashMap<Class<? extends Kernel>, KernelPreferences> preferences = new LinkedHashMap<>();
+    private LinkedHashMap<Class<? extends Kernel>, KernelProfile> profiles = new LinkedHashMap<>();
+    private LinkedHashMap<Class<? extends Kernel>, Kernel> sharedInstances = new LinkedHashMap<>();
+
+    private KernelPreferences defaultPreferences;
+
+    /**
+     * Builds the default preferences by calling {@link #createDefaultPreferences()}. Subclasses should note that the
+     * overridable methods reached from there run before the subclass's own field initializers.
+     */
+    protected KernelManager() {
+        defaultPreferences = createDefaultPreferences();
+    }
+
+    public static KernelManager instance() {
+        return INSTANCE;
+    }
+
+    public static void setKernelManager(KernelManager manager) {
+        INSTANCE = manager;
+    }
+
+    static {
+        if (Config.dumpProfilesOnExit) {
+            // print the profiling summary of whichever manager is current when the JVM shuts down
+            Runtime.getRuntime().addShutdownHook(new Thread() {
+                @Override
+                public void run() {
+                    StringBuilder builder = new StringBuilder(2048);
+                    instance().reportProfilingSummary(builder);
+                    System.out.println(builder);
+                }
+            });
+        }
+    }
+
+    /** This method returns a shared instance of a given Kernel subclass. The kernelClass needs a no-args constructor, which
+     * need not be public.
+     *
+     * <p>Given that compilation of OpenCL is relatively expensive and that (currently!) there is no caching of compiled OpenCL
+     * it is desirable to only ever create one instance of any given Kernel subclass, which this method facilitates.</p>
+     *
+     * <p>In order to maintain thread safety, it is necessary to synchronize on the returned kernel for the duration of the process of setting up,
+     * executing and extracting the results from that kernel, when using a shared instance.</p>
+     *
+     * @throws RuntimeException if the class cannot be instantiated
+     */
+    public static <T extends Kernel> T sharedKernelInstance(Class<T> kernelClass) {
+        return instance().getSharedKernelInstance(kernelClass);
+    }
+
+    /** Append a report to {@code builder} which contains information, per Kernel subclass, on which device is currently being used for the
+     * kernel class, and which (if any) devices failed to execute a given Kernel.
+     *
+     * @param builder receives the report
+     * @param withProfilingInfo if {@code true}, a table of mean elapsed times per device is appended for each kernel class that has a profile
+     */
+    public void reportDeviceUsage(StringBuilder builder, boolean withProfilingInfo) {
+        builder.append("Device Usage by Kernel Subclass");
+        if (withProfilingInfo) {
+            builder.append(" (showing mean elapsed times in milliseconds)");
+        }
+        builder.append("\n\n");
+        for (Map.Entry<Class<? extends Kernel>, KernelPreferences> entry : preferences.entrySet()) {
+            Class<? extends Kernel> klass = entry.getKey();
+            KernelPreferences kernelPreferences = entry.getValue();
+            KernelProfile profile = withProfilingInfo ? profiles.get(klass) : null;
+            builder.append(klass.getName()).append(":\n\tusing ").append(kernelPreferences.getPreferredDevice(null).getShortDescription());
+            List<Device> failedDevices = kernelPreferences.getFailedDevices();
+            if (failedDevices.size() > 0) {
+                builder.append(", failed devices = ");
+                for (int i = 0; i < failedDevices.size(); ++i) {
+                    builder.append(failedDevices.get(i).getShortDescription());
+                    if (i < failedDevices.size() - 1) {
+                        builder.append(" | ");
+                    }
+                }
+            }
+            if (profile != null) {
+                builder.append("\n");
+                int row = 0;
+                for (KernelDeviceProfile deviceProfile : profile.getDeviceProfiles()) {
+                    if (row == 0) {
+                        // the header is only emitted when there is at least one row to go under it
+                        builder.append(KernelDeviceProfile.getTableHeader()).append("\n");
+                    }
+                    builder.append(deviceProfile.getAverageAsTableRow()).append("\n");
+                    ++row;
+                }
+            }
+            builder.append("\n");
+        }
+    }
+
+    /** Append to {@code builder} a table of mean elapsed times per profiling stage, grouped by Kernel subclass with one row per device. */
+    public void reportProfilingSummary(StringBuilder builder) {
+        builder.append("\nProfiles by Kernel Subclass (mean elapsed times in milliseconds)\n\n");
+        builder.append(KernelDeviceProfile.getTableHeader()).append("\n");
+        for (Class<? extends Kernel> kernelClass : profiles.keySet()) {
+            String simpleName = Reflection.getSimpleName(kernelClass);
+            String kernelName = "----------------- [[ " + simpleName + " ]] ";
+            builder.append(kernelName);
+            // pad the separator line with dashes up to 132 characters
+            int dashes = 132 - kernelName.length();
+            for (int i = 0; i < dashes; ++i) {
+                builder.append('-');
+            }
+            builder.append("\n");
+            KernelProfile kernelProfile = profiles.get(kernelClass);
+            for (KernelDeviceProfile deviceProfile : kernelProfile.getDeviceProfiles()) {
+                builder.append(deviceProfile.getAverageAsTableRow()).append("\n");
+            }
+        }
+    }
+
+
+    /** Returns the preferences for the class of {@code kernel}, creating and registering them on first use. */
+    public KernelPreferences getPreferences(Kernel kernel) {
+        synchronized (preferences) {
+            KernelPreferences kernelPreferences = preferences.get(kernel.getClass());
+            if (kernelPreferences == null) {
+                kernelPreferences = new KernelPreferences(this, kernel.getClass());
+                preferences.put(kernel.getClass(), kernelPreferences);
+            }
+            return kernelPreferences;
+        }
+    }
+
+    public void setPreferredDevices(Kernel _kernel, LinkedHashSet<Device> _devices) {
+        KernelPreferences kernelPreferences = getPreferences(_kernel);
+        kernelPreferences.setPreferredDevices(_devices);
+    }
+
+    public KernelPreferences getDefaultPreferences() {
+        return defaultPreferences;
+    }
+
+    public void setDefaultPreferredDevices(LinkedHashSet<Device> _devices) {
+        defaultPreferences.setPreferredDevices(_devices);
+    }
+
+    /** Creates the preferences that are not tied to a kernel class, populated from {@link #createDefaultPreferredDevices()}. */
+    protected KernelPreferences createDefaultPreferences() {
+        KernelPreferences preferences = new KernelPreferences(this, null);
+        preferences.setPreferredDevices(createDefaultPreferredDevices());
+        return preferences;
+    }
+
+    /**
+     * Returns the cached shared instance of {@code kernelClass}, creating it through its no-args constructor on first use.
+     *
+     * @throws RuntimeException wrapping whatever prevented the instantiation
+     */
+    private <T extends Kernel> T getSharedKernelInstance(Class<T> kernelClass) {
+        synchronized (sharedInstances) {
+            T shared = kernelClass.cast(sharedInstances.get(kernelClass));
+            if (shared == null) {
+                try {
+                    // getDeclaredConstructor also finds non-public constructors; getConstructor only finds public
+                    // ones, which contradicts the documented contract and makes setAccessible pointless.
+                    Constructor<T> constructor = kernelClass.getDeclaredConstructor();
+                    constructor.setAccessible(true);
+                    shared = constructor.newInstance();
+                    sharedInstances.put(kernelClass, shared);
+                }
+                catch (Exception e) {
+                    throw new RuntimeException(e);
+                }
+            }
+            return shared;
+        }
+    }
+
+    /**
+     * Builds the ordered set of default devices by walking {@link #getPreferredDeviceTypes()}: all accelerators and
+     * all GPUs (each list sorted best first), at most one OpenCL CPU device, and the Java devices.
+     */
+    protected LinkedHashSet<Device> createDefaultPreferredDevices() {
+        LinkedHashSet<Device> devices = new LinkedHashSet<>();
+
+        List<OpenCLDevice> accelerators = OpenCLDevice.listDevices(Device.TYPE.ACC);
+        List<OpenCLDevice> gpus = OpenCLDevice.listDevices(Device.TYPE.GPU);
+        List<OpenCLDevice> cpus = OpenCLDevice.listDevices(Device.TYPE.CPU);
+
+        Collections.sort(accelerators, getDefaultAcceleratorComparator());
+        Collections.sort(gpus, getDefaultGPUComparator());
+
+        List<Device.TYPE> preferredDeviceTypes = getPreferredDeviceTypes();
+
+        for (Device.TYPE type : preferredDeviceTypes) {
+            switch (type) {
+                case UNKNOWN:
+                    throw new AssertionError("UNKNOWN device type not supported");
+                case GPU:
+                    devices.addAll(gpus);
+                    break;
+                case CPU:
+                    // only the first OpenCL CPU device is used, and there may be none at all
+                    if (!cpus.isEmpty()) {
+                        devices.add(cpus.get(0));
+                    }
+                    break;
+                case JTP:
+                    devices.add(JavaDevice.THREAD_POOL);
+                    break;
+                case SEQ:
+                    devices.add(JavaDevice.SEQUENTIAL);
+                    break;
+                case ACC:
+                    devices.addAll(accelerators);
+                    break;
+                case ALT:
+                    devices.add(JavaDevice.ALTERNATIVE_ALGORITHM);
+                    break;
+            }
+        }
+
+        return devices;
+    }
+
+    /** Device types in order of preference. Called while the constructor runs, so overrides must not rely on subclass instance fields. */
+    protected List<Device.TYPE> getPreferredDeviceTypes() {
+        return Arrays.asList(Device.TYPE.ACC, Device.TYPE.GPU, Device.TYPE.CPU, Device.TYPE.ALT, Device.TYPE.JTP);
+    }
+
+    /** NB, returns -ve for the better device. */
+    protected Comparator<OpenCLDevice> getDefaultAcceleratorComparator() {
+        return new Comparator<OpenCLDevice>() {
+            @Override
+            public int compare(OpenCLDevice left, OpenCLDevice right) {
+                // more compute units sorts first; Integer.compare cannot overflow the way a subtraction can
+                return Integer.compare(right.getMaxComputeUnits(), left.getMaxComputeUnits());
+            }
+        };
+    }
+
+    /** NB, returns -ve for the better device. */
+    protected Comparator<OpenCLDevice> getDefaultGPUComparator() {
+        return new Comparator<OpenCLDevice>() {
+            @Override
+            public int compare(OpenCLDevice left, OpenCLDevice right) {
+                if (selectLhs(left, right)) {
+                    return -1;
+                }
+                // ask the question the other way round so that equally ranked devices compare as 0, as the
+                // Comparator contract requires (in particular compare(x, x) must be 0)
+                return selectLhs(right, left) ? 1 : 0;
+            }
+        };
+    }
+
+    public Device bestDevice() {
+        return getDefaultPreferences().getPreferredDevice(null);
+    }
+
+    /** Returns {@code true} if {@code _deviceLhs} is strictly preferable to {@code _deviceRhs}. */
+    protected static boolean selectLhs(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
+        boolean nvidiaLhs = _deviceLhs.getOpenCLPlatform().getVendor().toLowerCase().contains("nvidia");
+        boolean nvidiaRhs = _deviceRhs.getOpenCLPlatform().getVendor().toLowerCase().contains("nvidia");
+        if (nvidiaLhs || nvidiaRhs) {
+            return selectLhsIfCUDA(_deviceLhs, _deviceRhs);
+        }
+        return _deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits();
+    }
+
+    /** NVidia/CUDA architecture reports maxComputeUnits in a completely different context, i.e. maxComputeUnits is not same as
+     * (is much less than) the number of OpenCL cores available.
+     *
+     * <p>Therefore when comparing an NVidia device we use different criteria.</p>
+     */
+    protected static boolean selectLhsIfCUDA(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
+        if (_deviceLhs.getType() != _deviceRhs.getType()) {
+            return selectLhsByType(_deviceLhs.getType(), _deviceRhs.getType());
+        }
+        // same type: the larger work group size wins, with global memory size as the tie-break
+        return _deviceLhs.getMaxWorkGroupSize() == _deviceRhs.getMaxWorkGroupSize()
+                ? _deviceLhs.getGlobalMemSize() > _deviceRhs.getGlobalMemSize()
+                : _deviceLhs.getMaxWorkGroupSize() > _deviceRhs.getMaxWorkGroupSize();
+    }
+
+    private static boolean selectLhsByType(Device.TYPE lhs, Device.TYPE rhs) {
+        return lhs.rank < rhs.rank;
+    }
+
+    /** Returns the profile for {@code kernelClass}, creating and registering it on first use. */
+    public KernelProfile getProfile(Class<? extends Kernel> kernelClass) {
+        synchronized (profiles) {
+            KernelProfile profile = profiles.get(kernelClass);
+            if (profile == null) {
+                profile = new KernelProfile(kernelClass);
+                profiles.put(kernelClass, profile);
+            }
+            return profile;
+        }
+    }
+
+    /** New home for deprecated methods of {@link Device}. */
+    public static class DeprecatedMethods {
+
+        /** Returns the first default preferred device of the given type, or {@code null} if there is none. */
+        @Deprecated
+        public static Device firstDevice(Device.TYPE _type) {
+            List<Device> devices = instance().getDefaultPreferences().getPreferredDevices(null);
+            for (Device device : devices) {
+                if(device.getType() == _type) {
+                    return device;
+                }
+            }
+            return null;
+        }
+
+        @SuppressWarnings("deprecation")
+        @Deprecated
+        public static Device bestGPU() {
+            return firstDevice(Device.TYPE.GPU);
+        }
+
+        @SuppressWarnings("deprecation")
+        @Deprecated
+        public static Device bestACC() {
+            return firstDevice(Device.TYPE.ACC);
+        }
+    }
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java
new file mode 100644
index 0000000000000000000000000000000000000000..8a31cd70be8d93895f8254b66a2f266f4bc164c6
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java
@@ -0,0 +1,31 @@
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.device.*;
+
+import java.util.*;
+
+/**
+ * KernelManager instances useful for debugging.
+ */
+public class KernelManagers {
+
+    public static final KernelManager JTP_ONLY = new KernelManager() {
+
+        @Override
+        protected List<Device.TYPE> getPreferredDeviceTypes() {
+            // This is called from the KernelManager constructor, i.e. before any field initializer of this
+            // anonymous subclass has run. Returning an instance field here would hand back null.
+            return Collections.singletonList(Device.TYPE.JTP);
+        }
+    };
+
+    public static final KernelManager SEQUENTIAL_ONLY = new KernelManager() {
+
+        @Override
+        protected List<Device.TYPE> getPreferredDeviceTypes() {
+            // see JTP_ONLY: must not depend on instance fields of this subclass
+            return Collections.singletonList(Device.TYPE.SEQ);
+        }
+    };
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java
new file mode 100644
index 0000000000000000000000000000000000000000..17e479a85fcfb7a9874943ade1d0c1cc042b674a
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java
@@ -0,0 +1,103 @@
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+
+import java.util.*;
+import java.util.logging.*;
+
+/**
+ * Collects profiling information per kernel class per device. Not thread safe, it is necessary for client code to correctly synchronize on
+ * objects of this class.
+ */
+public class KernelProfile {
+
+    private static final Logger logger = Logger.getLogger(Config.getLoggerName());
+    private final Class<? extends Kernel> kernelClass;
+    private LinkedHashMap<Device, KernelDeviceProfile> deviceProfiles = new LinkedHashMap<>();
+    /** Device of the execution in progress; {@code null} when none is in progress. */
+    private Device currentDevice;
+    /** Device of the most recently finished execution; {@code null} until one has finished. */
+    private Device lastDevice;
+    private KernelDeviceProfile currentDeviceProfile;
+
+    public KernelProfile(Class<? extends Kernel> _kernelClass) {
+        kernelClass = _kernelClass;
+    }
+
+    /**
+     * @return milliseconds from {@link ProfilingEvent#START} to {@link ProfilingEvent#EXECUTED} for the current or
+     *         most recent execution, or {@link Double#NaN} if nothing has been profiled yet
+     */
+    public double getLastExecutionTime() {
+        KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
+        return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED);
+    }
+
+    /**
+     * @return milliseconds from {@link ProfilingEvent#START} to {@link ProfilingEvent#PREPARE_EXECUTE}, i.e. the part
+     *         of the current or most recent execution spent before the kernel was ready to execute, or
+     *         {@link Double#NaN} if nothing has been profiled yet
+     */
+    public double getLastConversionTime() {
+        KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
+        // measuring to EXECUTED here would merely duplicate getLastExecutionTime()
+        return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.PREPARE_EXECUTE);
+    }
+
+    /**
+     * @return the value of {@link KernelDeviceProfile#getCumulativeElapsedTimeAll()} for the device used currently or
+     *         most recently, or {@link Double#NaN} if nothing has been profiled yet
+     */
+    public double getAccumulatedTotalTime() {
+        KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
+        if (lastDeviceProfile == null) {
+            return Double.NaN;
+        }
+        else {
+            return lastDeviceProfile.getCumulativeElapsedTimeAll();
+        }
+    }
+
+    /**
+     * @return the profile of the device of the execution in progress, else of the most recently finished execution,
+     *         else {@code null}
+     */
+    private KernelDeviceProfile getLastDeviceProfile() {
+        Device device = (currentDevice != null) ? currentDevice : lastDevice;
+        if (device == null) {
+            return null;
+        }
+        synchronized (deviceProfiles) {
+            return deviceProfiles.get(device);
+        }
+    }
+
+    /** Starts profiling an execution on {@code device}, creating the per-device profile on first use. */
+    void onStart(Device device) {
+        currentDevice = device;
+        synchronized (deviceProfiles) {
+            currentDeviceProfile = deviceProfiles.get(device);
+            if (currentDeviceProfile == null) {
+                currentDeviceProfile = new KernelDeviceProfile(kernelClass, device);
+                deviceProfiles.put(device, currentDeviceProfile);
+            }
+        }
+        currentDeviceProfile.onEvent(ProfilingEvent.START);
+    }
+
+    /**
+     * Forwards {@code event} to the profile of the current device.
+     *
+     * @throws IllegalArgumentException for {@link ProfilingEvent#START} (use {@link #onStart(Device)}) and for events
+     *         this method does not know
+     */
+    void onEvent(ProfilingEvent event) {
+        switch (event) {
+            case CLASS_MODEL_BUILT: // fallthrough
+            case OPENCL_GENERATED: // fallthrough
+            case OPENCL_COMPILED: // fallthrough
+            case PREPARE_EXECUTE: // fallthrough
+            case EXECUTED: // fallthrough
+            {
+                if (currentDeviceProfile == null) {
+                    logger.log(Level.SEVERE, "Error in KernelProfile, no currentDevice (synchronization error?");
+                    // nothing to record the event against; report it rather than fail with a NullPointerException
+                    break;
+                }
+                currentDeviceProfile.onEvent(event);
+                break;
+            }
+            case START:
+                throw new IllegalArgumentException("must use onStart(Device) to start profiling");
+            default:
+                throw new IllegalArgumentException("Unhandled event " + event);
+        }
+    }
+
+    void onFinishedExecution() {
+        reset();
+    }
+
+    /** Ends the execution in progress, remembering its device as the last one used. */
+    private void reset() {
+        lastDevice = currentDevice;
+        currentDevice = null;
+        currentDeviceProfile = null;
+    }
+
+    public Collection<Device> getDevices() {
+        return deviceProfiles.keySet();
+    }
+
+    public Collection<KernelDeviceProfile> getDeviceProfiles() {
+        return deviceProfiles.values();
+    }
+
+    public KernelDeviceProfile getDeviceProfile(Device device) {
+        return deviceProfiles.get(device);
+    }
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
index c2b69e44f1cb3564fa00d82c632d861c5ae93986..f162d695ed5130737b525cdbc707f49b41d56b30 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
@@ -37,45 +37,25 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 */
 package com.amd.aparapi.internal.kernel;
-import com.amd.aparapi.Config;
-import com.amd.aparapi.Kernel;
+import com.amd.aparapi.*;
 import com.amd.aparapi.Kernel.Constant;
-import com.amd.aparapi.Kernel.EXECUTION_MODE;
-import com.amd.aparapi.Kernel.KernelState;
-import com.amd.aparapi.Kernel.Local;
-import com.amd.aparapi.ProfileInfo;
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.internal.annotation.UsedByJNICode;
-import com.amd.aparapi.internal.exception.AparapiException;
-import com.amd.aparapi.internal.exception.CodeGenException;
-import com.amd.aparapi.internal.instruction.InstructionSet.TypeSpec;
-import com.amd.aparapi.internal.jni.KernelRunnerJNI;
-import com.amd.aparapi.internal.model.ClassModel;
-import com.amd.aparapi.internal.model.Entrypoint;
-import com.amd.aparapi.internal.util.UnsafeWrapper;
-import com.amd.aparapi.internal.writer.KernelWriter;
-import com.amd.aparapi.opencl.OpenCL;
-
-import java.lang.reflect.Array;
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.IntBuffer;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.StringTokenizer;
-import
java.util.concurrent.BrokenBarrierException; -import java.util.concurrent.CyclicBarrier; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.ForkJoinPool.ForkJoinWorkerThreadFactory; -import java.util.concurrent.ForkJoinPool.ManagedBlocker; -import java.util.concurrent.ForkJoinWorkerThread; -import java.util.logging.Level; -import java.util.logging.Logger; +import com.amd.aparapi.Kernel.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.annotation.*; +import com.amd.aparapi.internal.exception.*; +import com.amd.aparapi.internal.instruction.InstructionSet.*; +import com.amd.aparapi.internal.jni.*; +import com.amd.aparapi.internal.model.*; +import com.amd.aparapi.internal.util.*; +import com.amd.aparapi.internal.writer.*; +import com.amd.aparapi.opencl.*; + +import java.lang.reflect.*; +import java.nio.*; +import java.util.*; +import java.util.concurrent.*; +import java.util.concurrent.ForkJoinPool.*; +import java.util.logging.*; /** * The class is responsible for executing <code>Kernel</code> implementations. <br/> @@ -101,6 +81,7 @@ public class KernelRunner extends KernelRunnerJNI{ @UsedByJNICode public static final int PASS_ID_COMPLETED_EXECUTION = -1; @UsedByJNICode public static final int CANCEL_STATUS_FALSE = 0; @UsedByJNICode public static final int CANCEL_STATUS_TRUE = 1; + private static final String CODE_GEN_ERROR_MARKER = CodeGenException.class.getName(); private static Logger logger = Logger.getLogger(Config.getLoggerName()); @@ -147,6 +128,7 @@ public class KernelRunner extends KernelRunnerJNI{ private static final ForkJoinPool threadPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors(), lowPriorityThreadFactory, null, false); + private static HashMap<Class<? extends Kernel>, String> openCLCache = new HashMap<>(); /** * Create a KernelRunner for a specific Kernel instance. 
@@ -164,6 +146,8 @@ public class KernelRunner extends KernelRunnerJNI{ inBufferRemoteInt = inBufferRemote.asIntBuffer(); outBufferRemoteInt = outBufferRemote.asIntBuffer(); + + KernelManager.instance(); // ensures static initialization of KernalManager } /** @@ -172,7 +156,7 @@ public class KernelRunner extends KernelRunnerJNI{ * @see KernelRunnerJNI#disposeJNI(long) */ public void dispose() { - if (kernel.getExecutionMode().isOpenCL()) { + if (kernel.isRunningCL()) { disposeJNI(jniContextHandle); } // We are using a shared pool, so there's no need no shutdown it when kernel is disposed @@ -181,12 +165,6 @@ public class KernelRunner extends KernelRunnerJNI{ private Set<String> capabilitiesSet; - private long accumulatedExecutionTime = 0; - - private long conversionTime = 0; - - private long executionTime = 0; - boolean hasFP64Support() { if (capabilitiesSet == null) { throw new IllegalStateException("Capabilities queried before they were initialized"); @@ -316,312 +294,334 @@ public class KernelRunner extends KernelRunnerJNI{ } /** - * Execute using a Java thread pool. Either because we were explicitly asked to do so, or because we 'fall back' after discovering an OpenCL issue. - * - * @param _range - * The globalSize requested by the user (via <code>Kernel.execute(globalSize)</code>) - * @param _passes - * The # of passes requested by the user (via <code>Kernel.execute(globalSize, passes)</code>). Note this is usually defaulted to 1 via <code>Kernel.execute(globalSize)</code>. 
- * @return + * Execute using a Java thread pool, or sequentially, or using an alternative algorithm, usually as a result of failing to compile or execute OpenCL */ - protected long executeJava(final Range _range, final int _passes) { + @SuppressWarnings("deprecation") + protected void executeJava(ExecutionSettings _settings, Device device) { if (logger.isLoggable(Level.FINE)) { - logger.fine("executeJava: range = " + _range); + logger.fine("executeJava: range = " + _settings.range + ", device = " + device); } + boolean legacySequentialMode = kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.SEQ); passId = PASS_ID_PREPARING_EXECUTION; + _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE); + try { - final int localSize0 = _range.getLocalSize(0); - final int localSize1 = _range.getLocalSize(1); - final int localSize2 = _range.getLocalSize(2); - final int globalSize1 = _range.getGlobalSize(1); - if (kernel.getExecutionMode().equals(EXECUTION_MODE.SEQ)) { - /** - * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the - * product of localSize(0..3) is >1. So we can use multi-dim ranges but only if the local size is 1 in all dimensions. - * - * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op. - * - * So we need to check if the range is valid here. If not we have no choice but to punt. - */ - if ((localSize0 * localSize1 * localSize2) > 1) { - throw new IllegalStateException("Can't run range with group size >1 sequentially. 
Barriers would deadlock!"); + if (device == JavaDevice.ALTERNATIVE_ALGORITHM) { + if (kernel.hasFallbackAlgorithm()) { + for (passId = 0; passId < _settings.passes; ++passId) { + kernel.executeFallbackAlgorithm(_settings.range, passId); + } + } else { + boolean silently = true; // not having an alternative algorithm is the normal state, and does not need reporting + fallBackToNextDevice(_settings, (Exception) null, silently); } - - final Kernel kernelClone = kernel.clone(); - final KernelState kernelState = kernelClone.getKernelState(); - - kernelState.setRange(_range); - kernelState.setGroupId(0, 0); - kernelState.setGroupId(1, 0); - kernelState.setGroupId(2, 0); - kernelState.setLocalId(0, 0); - kernelState.setLocalId(1, 0); - kernelState.setLocalId(2, 0); - kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1)); - - for (passId = 0; passId < _passes; passId++) { - if (getCancelState() == CANCEL_STATUS_TRUE) { - break; + } else { + final int localSize0 = _settings.range.getLocalSize(0); + final int localSize1 = _settings.range.getLocalSize(1); + final int localSize2 = _settings.range.getLocalSize(2); + final int globalSize1 = _settings.range.getGlobalSize(1); + if (legacySequentialMode || device == JavaDevice.SEQUENTIAL) { + /** + * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the + * product of localSize(0..3) is >1. So we can use multi-dim ranges but only if the local size is 1 in all dimensions. + * + * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op. + * + * So we need to check if the range is valid here. If not we have no choice but to punt. + */ + if ((localSize0 * localSize1 * localSize2) > 1) { + throw new IllegalStateException("Can't run range with group size >1 sequentially. 
Barriers would deadlock!"); } - kernelState.setPassId(passId); - if (_range.getDims() == 1) { - for (int id = 0; id < _range.getGlobalSize(0); id++) { - kernelState.setGlobalId(0, id); - kernelClone.run(); + final Kernel kernelClone = kernel.clone(); + final KernelState kernelState = kernelClone.getKernelState(); + + kernelState.setRange(_settings.range); + kernelState.setGroupId(0, 0); + kernelState.setGroupId(1, 0); + kernelState.setGroupId(2, 0); + kernelState.setLocalId(0, 0); + kernelState.setLocalId(1, 0); + kernelState.setLocalId(2, 0); + kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1)); + + for (passId = 0; passId < _settings.passes; passId++) { + if (getCancelState() == CANCEL_STATUS_TRUE) { + break; } - } else if (_range.getDims() == 2) { - for (int x = 0; x < _range.getGlobalSize(0); x++) { - kernelState.setGlobalId(0, x); + kernelState.setPassId(passId); - for (int y = 0; y < globalSize1; y++) { - kernelState.setGlobalId(1, y); + if (_settings.range.getDims() == 1) { + for (int id = 0; id < _settings.range.getGlobalSize(0); id++) { + kernelState.setGlobalId(0, id); kernelClone.run(); } } - } else if (_range.getDims() == 3) { - for (int x = 0; x < _range.getGlobalSize(0); x++) { - kernelState.setGlobalId(0, x); - - for (int y = 0; y < globalSize1; y++) { - kernelState.setGlobalId(1, y); + else if (_settings.range.getDims() == 2) { + for (int x = 0; x < _settings.range.getGlobalSize(0); x++) { + kernelState.setGlobalId(0, x); - for (int z = 0; z < _range.getGlobalSize(2); z++) { - kernelState.setGlobalId(2, z); + for (int y = 0; y < globalSize1; y++) { + kernelState.setGlobalId(1, y); kernelClone.run(); } + } + } + else if (_settings.range.getDims() == 3) { + for (int x = 0; x < _settings.range.getGlobalSize(0); x++) { + kernelState.setGlobalId(0, x); - kernelClone.run(); + for (int y = 0; y < globalSize1; y++) { + kernelState.setGlobalId(1, y); + + for (int z = 0; z < _settings.range.getGlobalSize(2); z++) { + kernelState.setGlobalId(2, z); + 
kernelClone.run(); + } + + kernelClone.run(); + } } } } + passId = PASS_ID_COMPLETED_EXECUTION; } - passId = PASS_ID_COMPLETED_EXECUTION; - } else { - final int threads = localSize0 * localSize1 * localSize2; - final int numGroups0 = _range.getNumGroups(0); - final int numGroups1 = _range.getNumGroups(1); - final int globalGroups = numGroups0 * numGroups1 * _range.getNumGroups(2); - /** - * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread. - * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread) - */ - final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1); - - /** - * This localBarrier is only ever used by the kernels. If the kernel does not use the barrier the threads - * can get out of sync, we promised nothing in JTP mode. - * - * As with OpenCL all threads within a group must wait at the barrier or none. It is a user error (possible deadlock!) - * if the barrier is in a conditional that is only executed by some of the threads within a group. - * - * Kernel developer must understand this. - * - * This barrier is threadCount wide. We never hit the barrier from the dispatch thread. 
- */ - final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads); - - final ThreadIdSetter threadIdSetter; - - if (_range.getDims() == 1) { - threadIdSetter = new ThreadIdSetter(){ - @Override public void set(KernelState kernelState, int globalGroupId, int threadId) { - // (kernelState, globalGroupId, threadId) ->{ - kernelState.setLocalId(0, (threadId % localSize0)); - kernelState.setGlobalId(0, (threadId + (globalGroupId * threads))); - kernelState.setGroupId(0, globalGroupId); - } - }; - } else if (_range.getDims() == 2) { + else { + if (device != JavaDevice.THREAD_POOL && kernel.getExecutionMode() != Kernel.EXECUTION_MODE.JTP) { + throw new AssertionError("unexpected JavaDevice or EXECUTION_MODE"); + } + final int threads = localSize0 * localSize1 * localSize2; + final int numGroups0 = _settings.range.getNumGroups(0); + final int numGroups1 = _settings.range.getNumGroups(1); + final int globalGroups = numGroups0 * numGroups1 * _settings.range.getNumGroups(2); + /** + * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread. + * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread) + */ + final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1); /** - * Consider a 12x4 grid of 4*2 local groups - * <pre> - * threads = 4*2 = 8 - * localWidth=4 - * localHeight=2 - * globalWidth=12 - * globalHeight=4 - * - * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 - * 12 13 14 15 | 16 17 18 19 | 20 21 22 23 - * ------------+-------------+------------ - * 24 25 26 27 | 28 29 30 31 | 32 33 34 35 - * 36 37 38 39 | 40 41 42 43 | 44 45 46 47 + * This localBarrier is only ever used by the kernels. If the kernel does not use the barrier the threads + * can get out of sync, we promised nothing in JTP mode. 
* - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 threadIds : [0..7]*6 - * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 - * ------------+-------------+------------ - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 - * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * As with OpenCL all threads within a group must wait at the barrier or none. It is a user error (possible deadlock!) + * if the barrier is in a conditional that is only executed by some of the threads within a group. * - * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 groupId[0] : 0..6 - * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 - * ------------+-------------+------------ - * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 - * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 - * - * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 groupId[1] : 0..6 - * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 - * ------------+-------------+------------ - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * - * 00 01 02 03 | 08 09 10 11 | 16 17 18 19 globalThreadIds == threadId + groupId * threads; - * 04 05 06 07 | 12 13 14 15 | 20 21 22 23 - * ------------+-------------+------------ - * 24 25 26 27 | 32[33]34 35 | 40 41 42 43 - * 28 29 30 31 | 36 37 38 39 | 44 45 46 47 - * - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1) - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 - * ------------+-------------+------------ - * 00 01 02 03 | 00[01]02 03 | 00 01 02 03 - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 - * - * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 localY = threadId /localWidth (for globalThreadId 33 = threadId = 01 : 01/4 =0) - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * ------------+-------------+------------ - * 00 00 00 00 | 00[00]00 00 | 00 00 00 00 - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * - * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX= - * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 groupsPerLineWidth=globalWidth/localWidth (=12/4 =3) - * 
------------+-------------+------------ groupInset =groupId%groupsPerLineWidth (=4%3 = 1) - * 00 01 02 03 | 04[05]06 07 | 08 09 10 11 - * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX = groupInset*localWidth+localX (= 1*4+1 = 5) - * - * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 globalY - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * ------------+-------------+------------ - * 02 02 02 02 | 02[02]02 02 | 02 02 02 02 - * 03 03 03 03 | 03 03 03 03 | 03 03 03 03 - * - * </pre> - * Assume we are trying to locate the id's for #33 + * Kernel developer must understand this. * + * This barrier is threadCount wide. We never hit the barrier from the dispatch thread. */ - threadIdSetter = new ThreadIdSetter(){ - @Override public void set(KernelState kernelState, int globalGroupId, int threadId) { - // (kernelState, globalGroupId, threadId) ->{ - kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth = (for 33 = 1 % 4 = 1) - kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0) - - final int groupInset = globalGroupId % numGroups0; // 4%3 = 1 - kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5 - - final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2 - kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2 - kernelState.setGroupId(0, (globalGroupId % numGroups0)); - kernelState.setGroupId(1, (globalGroupId / numGroups0)); - } - }; - } else if (_range.getDims() == 3) { - //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code - threadIdSetter = new ThreadIdSetter(){ - @Override public void set(KernelState kernelState, int globalGroupId, int threadId) { - // (kernelState, globalGroupId, threadId) ->{ - kernelState.setLocalId(0, (threadId % localSize0)); + final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads); + + final 
ThreadIdSetter threadIdSetter; + + if (_settings.range.getDims() == 1) { + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); + kernelState.setGlobalId(0, (threadId + (globalGroupId * threads))); + kernelState.setGroupId(0, globalGroupId); + } + }; + } + else if (_settings.range.getDims() == 2) { + + /** + * Consider a 12x4 grid of 4*2 local groups + * <pre> + * threads = 4*2 = 8 + * localWidth=4 + * localHeight=2 + * globalWidth=12 + * globalHeight=4 + * + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 + * 12 13 14 15 | 16 17 18 19 | 20 21 22 23 + * ------------+-------------+------------ + * 24 25 26 27 | 28 29 30 31 | 32 33 34 35 + * 36 37 38 39 | 40 41 42 43 | 44 45 46 47 + * + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 threadIds : [0..7]*6 + * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * ------------+-------------+------------ + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 groupId[0] : 0..6 + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * ------------+-------------+------------ + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 groupId[1] : 0..6 + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 + * ------------+-------------+------------ + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * + * 00 01 02 03 | 08 09 10 11 | 16 17 18 19 globalThreadIds == threadId + groupId * threads; + * 04 05 06 07 | 12 13 14 15 | 20 21 22 23 + * ------------+-------------+------------ + * 24 25 26 27 | 32[33]34 35 | 40 41 42 43 + * 28 29 30 31 | 36 37 38 39 | 44 45 46 47 + * + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1) + * 00 
01 02 03 | 00 01 02 03 | 00 01 02 03 + * ------------+-------------+------------ + * 00 01 02 03 | 00[01]02 03 | 00 01 02 03 + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 localY = threadId /localWidth (for globalThreadId 33 = threadId = 01 : 01/4 =0) + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * ------------+-------------+------------ + * 00 00 00 00 | 00[00]00 00 | 00 00 00 00 + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX= + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 groupsPerLineWidth=globalWidth/localWidth (=12/4 =3) + * ------------+-------------+------------ groupInset =groupId%groupsPerLineWidth (=4%3 = 1) + * 00 01 02 03 | 04[05]06 07 | 08 09 10 11 + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX = groupInset*localWidth+localX (= 1*4+1 = 5) + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 globalY + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * ------------+-------------+------------ + * 02 02 02 02 | 02[02]02 02 | 02 02 02 02 + * 03 03 03 03 | 03 03 03 03 | 03 03 03 03 + * + * </pre> + * Assume we are trying to locate the id's for #33 + * + */ + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth = (for 33 = 1 % 4 = 1) + kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0) + + final int groupInset = globalGroupId % numGroups0; // 4%3 = 1 + kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5 + + final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2 + kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2 + kernelState.setGroupId(0, (globalGroupId % numGroups0)); + kernelState.setGroupId(1, (globalGroupId / 
numGroups0)); + } + }; + } + else if (_settings.range.getDims() == 3) { + //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); - kernelState.setLocalId(1, ((threadId / localSize0) % localSize1)); + kernelState.setLocalId(1, ((threadId / localSize0) % localSize1)); - // the thread id's span WxHxD so threadId/(WxH) should yield the local depth - kernelState.setLocalId(2, (threadId / (localSize0 * localSize1))); + // the thread id's span WxHxD so threadId/(WxH) should yield the local depth + kernelState.setLocalId(2, (threadId / (localSize0 * localSize1))); - kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0])); + kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0])); - kernelState.setGlobalId(1, - ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1])); + kernelState.setGlobalId(1, + ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1])); - kernelState.setGlobalId(2, - (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2])); + kernelState.setGlobalId(2, + (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2])); - kernelState.setGroupId(0, (globalGroupId % numGroups0)); - kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1)); - kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1))); - } - }; - } else - throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _range.getDims()); - for (passId = 0; passId < _passes; passId++) { - if (getCancelState() == 
CANCEL_STATUS_TRUE) { - break; + kernelState.setGroupId(0, (globalGroupId % numGroups0)); + kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1)); + kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1))); + } + }; } - /** - * Note that we emulate OpenCL by creating one thread per localId (across the group). - * - * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2); - * - * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0). - * - * We also clone the kernel 4 times. One per thread. - * - * We create local barrier which has a width of 4 - * - * Thread-0 handles localId(0) (global 0,4,8) - * Thread-1 handles localId(1) (global 1,5,7) - * Thread-2 handles localId(2) (global 2,6,10) - * Thread-3 handles localId(3) (global 3,7,11) - * - * This allows all threads to synchronize using the local barrier. - * - * Initially the use of local buffers seems broken as the buffers appears to be per Kernel. - * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global) - * So each of the cloned kernels actually still reference the same underlying local/global buffers. - * - * If the kernel uses local buffers but does not use barriers then it is possible for different groups - * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it - * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong) - * - * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep. - * - **/ - for (int id = 0; id < threads; id++) { - final int threadId = id; - + else + throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _settings.range.getDims()); + for (passId = 0; passId < _settings.passes; passId++) { + if (getCancelState() == CANCEL_STATUS_TRUE) { + break; + } /** - * We clone one kernel for each thread. 
+ * Note that we emulate OpenCL by creating one thread per localId (across the group). * - * They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow. - * We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying - * about other threads. - */ - final Kernel kernelClone = kernel.clone(); - final KernelState kernelState = kernelClone.getKernelState(); - kernelState.setRange(_range); - kernelState.setPassId(passId); - - if (threads == 1) { - kernelState.disableLocalBarrier(); - } else { - kernelState.setLocalBarrier(localBarrier); - } + * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2); + * + * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0). + * + * We also clone the kernel 4 times. One per thread. + * + * We create local barrier which has a width of 4 + * + * Thread-0 handles localId(0) (global 0,4,8) + * Thread-1 handles localId(1) (global 1,5,7) + * Thread-2 handles localId(2) (global 2,6,10) + * Thread-3 handles localId(3) (global 3,7,11) + * + * This allows all threads to synchronize using the local barrier. + * + * Initially the use of local buffers seems broken as the buffers appears to be per Kernel. + * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global) + * So each of the cloned kernels actually still reference the same underlying local/global buffers. + * + * If the kernel uses local buffers but does not use barriers then it is possible for different groups + * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it + * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong) + * + * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep. 
+ * + **/ + for (int id = 0; id < threads; id++) { + final int threadId = id; + + /** + * We clone one kernel for each thread. + * + * They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow. + * We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying + * about other threads. + */ + final Kernel kernelClone = kernel.clone(); + final KernelState kernelState = kernelClone.getKernelState(); + kernelState.setRange(_settings.range); + kernelState.setPassId(passId); + + if (threads == 1) { + kernelState.disableLocalBarrier(); + } + else { + kernelState.setLocalBarrier(localBarrier); + } - threadPool.submit( - // () -> { - new Runnable(){ - public void run() { - try { - for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) { - threadIdSetter.set(kernelState, globalGroupId, threadId); - kernelClone.run(); - } - } catch (RuntimeException | Error e) { - logger.log(Level.SEVERE, "Execution failed", e); - } finally { - await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join. + threadPool.submit( + // () -> { + new Runnable() { + public void run() { + try { + for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) { + threadIdSetter.set(kernelState, globalGroupId, threadId); + kernelClone.run(); } } - }); - } - - await(joinBarrier); // This dispatch thread waits for all worker threads here. - } - passId = PASS_ID_COMPLETED_EXECUTION; - } // execution mode == JTP + catch (RuntimeException | Error e) { + logger.log(Level.SEVERE, "Execution failed", e); + } + finally { + await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join. + } + } + }); + } - return 0; + await(joinBarrier); // This dispatch thread waits for all worker threads here. 
+ } + passId = PASS_ID_COMPLETED_EXECUTION; + } // execution mode == JTP + } } finally { passId = PASS_ID_COMPLETED_EXECUTION; } @@ -964,63 +964,22 @@ public class KernelRunner extends KernelRunnerJNI{ return needsSync; } - // private int numAvailableProcessors = Runtime.getRuntime().availableProcessors(); - - private Kernel executeOpenCL(final String _entrypointName, final Range _range, final int _passes) throws AparapiException { - /* - if (_range.getDims() > getMaxWorkItemDimensionsJNI(jniContextHandle)) { - throw new RangeException("Range dim size " + _range.getDims() + " > device " - + getMaxWorkItemDimensionsJNI(jniContextHandle)); - } - if (_range.getWorkGroupSize() > getMaxWorkGroupSizeJNI(jniContextHandle)) { - throw new RangeException("Range workgroup size " + _range.getWorkGroupSize() + " > device " - + getMaxWorkGroupSizeJNI(jniContextHandle)); - } - - if (_range.getGlobalSize(0) > getMaxWorkItemSizeJNI(jniContextHandle, 0)) { - throw new RangeException("Range globalsize 0 " + _range.getGlobalSize(0) + " > device " - + getMaxWorkItemSizeJNI(jniContextHandle, 0)); - } - if (_range.getDims() > 1) { - if (_range.getGlobalSize(1) > getMaxWorkItemSizeJNI(jniContextHandle, 1)) { - throw new RangeException("Range globalsize 1 " + _range.getGlobalSize(1) + " > device " - + getMaxWorkItemSizeJNI(jniContextHandle, 1)); - } - if (_range.getDims() > 2) { - if (_range.getGlobalSize(2) > getMaxWorkItemSizeJNI(jniContextHandle, 2)) { - throw new RangeException("Range globalsize 2 " + _range.getGlobalSize(2) + " > device " - + getMaxWorkItemSizeJNI(jniContextHandle, 2)); - } - } - } - + @SuppressWarnings("deprecation") + private Kernel executeOpenCL(ExecutionSettings _settings) throws AparapiException { - if (logger.isLoggable(Level.FINE)) { - logger.fine("maxComputeUnits=" + this.getMaxComputeUnitsJNI(jniContextHandle)); - logger.fine("maxWorkGroupSize=" + this.getMaxWorkGroupSizeJNI(jniContextHandle)); - logger.fine("maxWorkItemDimensions=" + 
this.getMaxWorkItemDimensionsJNI(jniContextHandle)); - logger.fine("maxWorkItemSize(0)=" + getMaxWorkItemSizeJNI(jniContextHandle, 0)); - if (_range.getDims() > 1) { - logger.fine("maxWorkItemSize(1)=" + getMaxWorkItemSizeJNI(jniContextHandle, 1)); - if (_range.getDims() > 2) { - logger.fine("maxWorkItemSize(2)=" + getMaxWorkItemSizeJNI(jniContextHandle, 2)); - } - } - } - */ // Read the array refs after kernel may have changed them // We need to do this as input to computing the localSize assert args != null : "args should not be null"; final boolean needSync = updateKernelArrayRefs(); if (needSync && logger.isLoggable(Level.FINE)) { - logger.fine("Need to resync arrays on " + describeKernelClass()); + logger.fine("Need to resync arrays on " + kernel); } // native side will reallocate array buffers if necessary - if (runKernelJNI(jniContextHandle, _range, needSync, _passes, inBufferRemote, outBufferRemote) != 0) { - logger.warning("### " + describeKernelClass() + " - CL exec seems to have failed. Trying to revert to Java ###"); - kernel.setFallbackExecutionMode(); - return execute(_entrypointName, _range, _passes); + int returnValue = runKernelJNI(jniContextHandle, _settings.range, needSync, _settings.passes, inBufferRemote, outBufferRemote); + if (returnValue != 0) { + String reason = "OpenCL execution seems to have failed (runKernelJNI returned " + returnValue + ")"; + return fallBackToNextDevice(_settings, new AparapiException(reason)); } if (usesOopConversion == true) { @@ -1028,343 +987,488 @@ public class KernelRunner extends KernelRunnerJNI{ } if (logger.isLoggable(Level.FINE)) { - logger.fine("executeOpenCL completed. " + _range); + logger.fine("executeOpenCL completed. 
" + _settings.range); } return kernel; } - public synchronized Kernel execute(Kernel.Entry entry, final Range _range, final int _passes) { - System.out.println("execute(Kernel.Entry, size) not implemented"); - return (kernel); - } - - synchronized private Kernel fallBackAndExecute(String _entrypointName, final Range _range, final int _passes) { + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackByExecutionMode(ExecutionSettings _settings) { isFallBack = true; if (kernel.hasNextExecutionMode()) { kernel.tryNextExecutionMode(); + if (logger.isLoggable(Level.WARNING)) { + logger.warning("Trying next execution mode " + kernel.getExecutionMode()); + } } else { kernel.setFallbackExecutionMode(); } + recreateRange(_settings); + return executeInternal(_settings); + } + + private void recreateRange(ExecutionSettings _settings) { + if (_settings.range.isLocalIsDerived() && !_settings.legacyExecutionMode) { + Device device = kernel.getTargetDevice(); + Range result; + switch (_settings.range.getDims()) { + case 1: { + result = Range.create(device, _settings.range.getGlobalSize_0()); + break; + } + case 2: { + result = Range.create2D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1()); + break; + } + case 3: { + result = Range.create3D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1(), _settings.range.getGlobalSize_2()); + break; + } + default: { + throw new AssertionError("Range.getDims() = " + _settings.range.getDims()); + } + } + _settings.range = result; + } + } + + private Kernel fallBackToNextDevice(ExecutionSettings _settings, String _reason) { + return fallBackToNextDevice(_settings, new AparapiException(_reason)); + } - return execute(_entrypointName, _range, _passes); + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception) { + return fallBackToNextDevice(_settings, _exception, false); } - synchronized private 
Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes, - Exception _exception) { - if (logger.isLoggable(Level.WARNING)) { - logger.warning("Reverting to the next execution mode for " + describeKernelClass() + ": " + _exception.getMessage()); - _exception.printStackTrace(); + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception, boolean _silently) { + isFallBack = true; + _settings.profile.onEvent(ProfilingEvent.EXECUTED); + if (_settings.legacyExecutionMode) { + if (!_silently && logger.isLoggable(Level.WARNING)) { + logger.warning("Execution mode " + kernel.getExecutionMode() + " failed for " + kernel + ": " + _exception.getMessage()); + _exception.printStackTrace(); + } + return fallBackByExecutionMode(_settings); + } else { + KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); + if (!_silently && logger.isLoggable(Level.WARNING)) { + logger.warning("Device failed for " + kernel + ": " + _exception.getMessage()); + } + + preferences.markPreferredDeviceFailed(); + +// Device nextDevice = preferences.getPreferredDevice(kernel); +// +// if (nextDevice == null) { +// if (!_silently && logger.isLoggable(Level.SEVERE)) { +// logger.severe("No Devices left to try, giving up"); +// } +// throw new RuntimeException(_exception); +// } + if (!_silently && logger.isLoggable(Level.WARNING)) { + _exception.printStackTrace(); + logger.warning("Trying next device: " + describeDevice()); + } } - return fallBackAndExecute(_entrypointName, _range, _passes); + + recreateRange(_settings); + return executeInternal(_settings); } - private String describeKernelClass() { - return kernel.getClass().getName(); + private String describeDevice() { + Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel); + return (device == null) ? 
"<default fallback>" : device.getShortDescription(); } - synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes, String _excuse) { - logger.warning("Reverting to the next execution mode for " + describeKernelClass() + ": " + _excuse); - return fallBackAndExecute(_entrypointName, _range, _passes); + @Override + public String toString() { + return "KernelRunner{" + kernel + "}"; } - public synchronized Kernel execute(String _entrypointName, final Range _range, final int _passes) { - clearCancelMultiPass(); + @SuppressWarnings("deprecation") + public synchronized Kernel execute(String _entrypoint, final Range _range, final int _passes) { executing = true; - try { - long executeStartTime = System.currentTimeMillis(); + clearCancelMultiPass(); + KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass()); + KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); + boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO; - if (_range == null) { - throw new IllegalStateException("range can't be null"); + ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode); + try { + // Two Kernels of the same class share the same KernelPreferences object, and since failure (fallback) generally mutates + // the preferences object, we must lock it. Note this prevents two Kernels of the same class executing simultaneously. 
+ synchronized (preferences) { + return executeInternal(settings); } + } finally { + executing = false; + clearCancelMultiPass(); + } + } - /* for backward compatibility reasons we still honor execution mode */ - if (kernel.getExecutionMode().isOpenCL()) { - // System.out.println("OpenCL"); + @SuppressWarnings("deprecation") + private synchronized Kernel executeInternal(ExecutionSettings _settings) { - // See if user supplied a Device - Device device = _range.getDevice(); + if (_settings.range == null) { + throw new IllegalStateException("range can't be null"); + } - if ((device == null) || (device instanceof OpenCLDevice)) { - if ((entryPoint == null) || (isFallBack)) { - if (entryPoint == null) { - try { - final ClassModel classModel = ClassModel.createClassModel(kernel.getClass()); - entryPoint = classModel.getEntrypoint(_entrypointName, kernel); - } catch (final Exception exception) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, exception); - } - } + EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode(); - if ((entryPoint != null) && !entryPoint.shouldFallback()) { - synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68 - if (device != null && !(device instanceof OpenCLDevice)) { - throw new IllegalStateException("range's device is not suitable for OpenCL "); - } + if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) { + fallBackToNextDevice(_settings, "OpenCL was requested but Device supplied was not an OpenCLDevice"); + } - OpenCLDevice openCLDevice = (OpenCLDevice) device; // still might be null! 
+ Device device = _settings.range.getDevice(); + boolean userSpecifiedDevice = true; + if (device == null) { + userSpecifiedDevice = false; + if (!_settings.legacyExecutionMode) { + device = _settings.preferences.getPreferredDevice(kernel); + if (device == null) { + // the default fallback when KernelPreferences has run out of options is JTP + device = JavaDevice.THREAD_POOL; + } + } else { + if (requestedExecutionMode == EXECUTION_MODE.JTP) { + device = JavaDevice.THREAD_POOL; + } else if (requestedExecutionMode == EXECUTION_MODE.SEQ) { + device = JavaDevice.SEQUENTIAL; + } + } + } else { + boolean compatible = isDeviceCompatible(device); + if (!compatible) { + throw new AssertionError("user supplied Device incompatible with current EXECUTION_MODE or getTargetDevice(); device = " + + device.getShortDescription() + "; kernel = " + kernel); + } + } - int jniFlags = 0; - if (openCLDevice == null) { - if (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU)) { - // Get the best GPU - openCLDevice = (OpenCLDevice) OpenCLDevice.bestGPU(); - jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. - if (openCLDevice == null) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "GPU request can't be honored"); - } - } else if (kernel.getExecutionMode().equals(EXECUTION_MODE.ACC)) { - // Get the best ACC - openCLDevice = (OpenCLDevice) OpenCLDevice.bestACC(); - jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. 
- if (openCLDevice == null) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "ACC request can't be honored"); - } - } else { - // We fetch the first CPU device - openCLDevice = (OpenCLDevice) OpenCLDevice.firstCPU(); - if (openCLDevice == null) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, - "CPU request can't be honored not CPU device"); - } - } - } else { // openCLDevice == null - if (openCLDevice.getType() == Device.TYPE.GPU) { - jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. - } else if (openCLDevice.getType() == Device.TYPE.ACC) { - jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. - } - } + try { + OpenCLDevice openCLDevice = device instanceof OpenCLDevice ? (OpenCLDevice) device : null; - // jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0); - // jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0); - // jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0); - // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0); - // jniFlags |= (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0); - // Init the device to check capabilities before emitting the - // code that requires the capabilities. + int jniFlags = 0; + if (_settings.legacyExecutionMode && device != null && !(device instanceof OpenCLDevice)) { + hashCode(); + } + // for legacy reasons use old logic where Kernel.EXECUTION_MODE is not AUTO + if (_settings.legacyExecutionMode && !userSpecifiedDevice && requestedExecutionMode.isOpenCL()) { + if (requestedExecutionMode.equals(EXECUTION_MODE.GPU)) { + // Get the best GPU + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestGPU(); + jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. 
+ if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "GPU request can't be honored, no GPU device"); + } + } else if (requestedExecutionMode.equals(EXECUTION_MODE.ACC)) { + // Get the best ACC + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestACC(); + jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. + if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "ACC request can't be honored, no ACC device"); + } + } else { + // We fetch the first CPU device + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.firstDevice(Device.TYPE.CPU); + if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "CPU request can't be honored, no CPU device"); + } + } + } else { + if (device.getType() == Device.TYPE.GPU) { + jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. + } else if (device.getType() == Device.TYPE.ACC) { + jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. + } + } + if (device == null && openCLDevice != null) { + device = openCLDevice; + } + assert device != null : "No device available"; + _settings.profile.onStart(device); + /* for backward compatibility reasons we still honor execution mode */ + boolean isOpenCl = requestedExecutionMode.isOpenCL() || device instanceof OpenCLDevice; + if (isOpenCl) { + if ((entryPoint == null) || (isFallBack)) { + if (entryPoint == null) { + try { + final ClassModel classModel = ClassModel.createClassModel(kernel.getClass()); + entryPoint = classModel.getEntrypoint(_settings.entrypoint, kernel); + _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT); + } catch (final Exception exception) { + _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT); + return fallBackToNextDevice(_settings, exception); + } + } - // synchronized(Kernel.class){ - jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here - } // end of synchronized! 
issue 68 + if ((entryPoint != null)) { + synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68 - if (jniContextHandle == 0) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "initJNI failed to return a valid handle"); - } + // jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0); + // jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0); + // jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0); + // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0); + // jniFlags |= (kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0); + // Init the device to check capabilities before emitting the + // code that requires the capabilities. - final String extensions = getExtensionsJNI(jniContextHandle); - capabilitiesSet = new HashSet<String>(); + // synchronized(Kernel.class){ + jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here + } // end of synchronized! 
issue 68 - final StringTokenizer strTok = new StringTokenizer(extensions); - while (strTok.hasMoreTokens()) { - capabilitiesSet.add(strTok.nextToken()); - } + if (jniContextHandle == 0) { + return fallBackToNextDevice(_settings, "initJNI failed to return a valid handle"); + } - if (logger.isLoggable(Level.FINE)) { - logger.fine("Capabilities initialized to :" + capabilitiesSet.toString()); - } + final String extensions = getExtensionsJNI(jniContextHandle); + capabilitiesSet = new HashSet<String>(); - if (entryPoint.requiresDoublePragma() && !hasFP64Support()) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "FP64 required but not supported"); - } + final StringTokenizer strTok = new StringTokenizer(extensions); + while (strTok.hasMoreTokens()) { + capabilitiesSet.add(strTok.nextToken()); + } - if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, - "Byte addressable stores required but not supported"); - } + if (logger.isLoggable(Level.FINE)) { + logger.fine("Capabilities initialized to :" + capabilitiesSet.toString()); + } - final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport() - && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() - && hasLocalInt32ExtendedAtomicsSupport(); + if (entryPoint.requiresDoublePragma() && !hasFP64Support()) { + return fallBackToNextDevice(_settings, "FP64 required but not supported"); + } - if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) { + if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) { + return fallBackToNextDevice(_settings, "Byte addressable stores required but not supported"); + } - return warnFallBackAndExecute(_entrypointName, _range, _passes, "32 bit Atomics required but not supported"); - } + final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport() + && 
hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() + && hasLocalInt32ExtendedAtomicsSupport(); - String openCL = null; - try { - openCL = KernelWriter.writeToString(entryPoint); - } catch (final CodeGenException codeGenException) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, codeGenException); - } + if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) { - if (Config.enableShowGeneratedOpenCL) { - System.out.println(openCL); - } + return fallBackToNextDevice(_settings, "32 bit Atomics required but not supported"); + } - if (logger.isLoggable(Level.INFO)) { - logger.info(openCL); + String openCL; + synchronized (openCLCache) { + openCL = openCLCache.get(kernel.getClass()); + if (openCL == null) { + try { + openCL = KernelWriter.writeToString(entryPoint); + if (logger.isLoggable(Level.INFO)) { + logger.info(openCL); + } + else if (Config.enableShowGeneratedOpenCL) { + System.out.println(openCL); + } + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + openCLCache.put(kernel.getClass(), openCL); + } + catch (final CodeGenException codeGenException) { + openCLCache.put(kernel.getClass(), CODE_GEN_ERROR_MARKER); + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + return fallBackToNextDevice(_settings, codeGenException); + } } - - // Send the string to OpenCL to compile it - if (buildProgramJNI(jniContextHandle, openCL) == 0) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "OpenCL compile failed"); + else { + if (openCL.equals(CODE_GEN_ERROR_MARKER)) { + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + boolean silently = true; // since we must have already reported the CodeGenException + return fallBackToNextDevice(_settings, null, silently); + } } + } - args = new KernelArg[entryPoint.getReferencedFields().size()]; - int i = 0; + // Send the string to OpenCL to compile it + long handle = buildProgramJNI(jniContextHandle, openCL); + 
_settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED); + if (handle == 0) { + return fallBackToNextDevice(_settings, "OpenCL compile failed"); + } - for (final Field field : entryPoint.getReferencedFields()) { - try { - field.setAccessible(true); - args[i] = new KernelArg(); - args[i].setName(field.getName()); - args[i].setField(field); - if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) { - args[i].setType(args[i].getType() | ARG_STATIC); + args = new KernelArg[entryPoint.getReferencedFields().size()]; + int i = 0; + + for (final Field field : entryPoint.getReferencedFields()) { + try { + field.setAccessible(true); + args[i] = new KernelArg(); + args[i].setName(field.getName()); + args[i].setField(field); + if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) { + args[i].setType(args[i].getType() | ARG_STATIC); + } + + final Class<?> type = field.getType(); + if (type.isArray()) { + + if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) { + args[i].setType(args[i].getType() | ARG_LOCAL); + } else if ((field.getAnnotation(Constant.class) != null) + || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) { + args[i].setType(args[i].getType() | ARG_CONSTANT); + } else { + args[i].setType(args[i].getType() | ARG_GLOBAL); } + if (isExplicit()) { + args[i].setType(args[i].getType() | ARG_EXPLICIT); + } + // for now, treat all write arrays as read-write, see bugzilla issue 4859 + // we might come up with a better solution later + args[i].setType(args[i].getType() + | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0)); + args[i].setType(args[i].getType() + | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? 
ARG_READ : 0)); + // args[i].type |= ARG_GLOBAL; + + if (type.getName().startsWith("[L")) { + args[i].setType(args[i].getType() + | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER)); - final Class<?> type = field.getType(); - if (type.isArray()) { + if (logger.isLoggable(Level.FINE)) { + logger.fine("tagging " + args[i].getName() + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); + } + } else if (type.getName().startsWith("[[")) { - if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) { - args[i].setType(args[i].getType() | ARG_LOCAL); - } else if ((field.getAnnotation(Constant.class) != null) - || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) { - args[i].setType(args[i].getType() | ARG_CONSTANT); - } else { - args[i].setType(args[i].getType() | ARG_GLOBAL); + try { + setMultiArrayType(args[i], type); + } catch (AparapiException e) { + return fallBackToNextDevice(_settings, "failed to set kernel arguement " + + args[i].getName() + ". Aparapi only supports 2D and 3D arrays."); } - if (isExplicit()) { - args[i].setType(args[i].getType() | ARG_EXPLICIT); + } else { + + args[i].setArray(null); // will get updated in updateKernelArrayRefs + args[i].setType(args[i].getType() | ARG_ARRAY); + + args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? 
ARG_LONG : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0)); + + // arrays whose length is used will have an int arg holding + // the length as a kernel param + if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) { + args[i].setType(args[i].getType() | ARG_ARRAYLENGTH); } - // for now, treat all write arrays as read-write, see bugzilla issue 4859 - // we might come up with a better solution later - args[i].setType(args[i].getType() - | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0)); - args[i].setType(args[i].getType() - | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0)); - // args[i].type |= ARG_GLOBAL; if (type.getName().startsWith("[L")) { - args[i].setType(args[i].getType() - | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER)); - + args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)); if (logger.isLoggable(Level.FINE)) { - logger.fine("tagging " + args[i].getName() + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); - } - } else if (type.getName().startsWith("[[")) { - - try { - setMultiArrayType(args[i], type); - } catch (AparapiException e) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to set kernel arguement " - + args[i].getName() + ". Aparapi only supports 2D and 3D arrays."); - } - } else { - - args[i].setArray(null); // will get updated in updateKernelArrayRefs - args[i].setType(args[i].getType() | ARG_ARRAY); - - args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? 
ARG_BYTE : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0)); - - // arrays whose length is used will have an int arg holding - // the length as a kernel param - if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) { - args[i].setType(args[i].getType() | ARG_ARRAYLENGTH); - } - - if (type.getName().startsWith("[L")) { - args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)); - if (logger.isLoggable(Level.FINE)) { - logger.fine("tagging " + args[i].getName() - + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); - } + logger.fine("tagging " + args[i].getName() + + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); } } - } else if (type.isAssignableFrom(float.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_FLOAT); - } else if (type.isAssignableFrom(int.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_INT); - } else if (type.isAssignableFrom(double.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_DOUBLE); - } else if (type.isAssignableFrom(long.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_LONG); - } else if (type.isAssignableFrom(boolean.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_BOOLEAN); - } else if (type.isAssignableFrom(byte.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_BYTE); - } else if (type.isAssignableFrom(char.class)) { - 
args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_CHAR); - } else if (type.isAssignableFrom(short.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_SHORT); } - // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type ); - } catch (final IllegalArgumentException e) { - e.printStackTrace(); + } else if (type.isAssignableFrom(float.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_FLOAT); + } else if (type.isAssignableFrom(int.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_INT); + } else if (type.isAssignableFrom(double.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_DOUBLE); + } else if (type.isAssignableFrom(long.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_LONG); + } else if (type.isAssignableFrom(boolean.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_BOOLEAN); + } else if (type.isAssignableFrom(byte.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_BYTE); + } else if (type.isAssignableFrom(char.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_CHAR); + } else if (type.isAssignableFrom(short.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_SHORT); } + // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type ); + } catch (final IllegalArgumentException e) { + e.printStackTrace(); + } - args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType())); - - if (logger.isLoggable(Level.FINE)) { - logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + 
Integer.toHexString(args[i].getType()) - + ", primitiveSize=" + args[i].getPrimitiveSize()); - } + args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType())); - i++; + if (logger.isLoggable(Level.FINE)) { + logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType()) + + ", primitiveSize=" + args[i].getPrimitiveSize()); } - // at this point, i = the actual used number of arguments - // (private buffers do not get treated as arguments) - - argc = i; + i++; + } - setArgsJNI(jniContextHandle, args, argc); + // at this point, i = the actual used number of arguments + // (private buffers do not get treated as arguments) - conversionTime = System.currentTimeMillis() - executeStartTime; + argc = i; - try { - executeOpenCL(_entrypointName, _range, _passes); - isFallBack = false; - } catch (final AparapiException e) { - warnFallBackAndExecute(_entrypointName, _range, _passes, e); - } - } else { // (entryPoint != null) && !entryPoint.shouldFallback() - warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to locate entrypoint"); - } - } else { // (entryPoint == null) || (isFallBack) + setArgsJNI(jniContextHandle, args, argc); + _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE); try { - executeOpenCL(_entrypointName, _range, _passes); + executeOpenCL(_settings); isFallBack = false; } catch (final AparapiException e) { - warnFallBackAndExecute(_entrypointName, _range, _passes, e); + fallBackToNextDevice(_settings, e); } + } else { // (entryPoint != null) && !entryPoint.shouldFallback() + fallBackToNextDevice(_settings, "failed to locate entrypoint"); + } + } else { // (entryPoint == null) || (isFallBack) + try { + executeOpenCL(_settings); + isFallBack = false; + } catch (final AparapiException e) { + fallBackToNextDevice(_settings, e); } - } else { // (device == null) || (device instanceof OpenCLDevice) - warnFallBackAndExecute(_entrypointName, _range, _passes, - "OpenCL was requested but Device supplied was not 
an OpenCLDevice"); } - } else { // kernel.getExecutionMode().isOpenCL() - executeJava(_range, _passes); + } else { // isOpenCL + if (!(device instanceof JavaDevice)) { + fallBackToNextDevice(_settings, "Non-OpenCL Kernel.EXECUTION_MODE requested but device is not a JavaDevice "); + } + executeJava(_settings, (JavaDevice) device); } if (Config.enableExecutionModeReporting) { - System.out.println(describeKernelClass() + ":" + kernel.getExecutionMode()); + System.out.println("execution complete: " + kernel); } - executionTime = System.currentTimeMillis() - executeStartTime; - accumulatedExecutionTime += executionTime; - return kernel; - } finally { - executing = false; - clearCancelMultiPass(); + } + finally { + _settings.profile.onEvent(ProfilingEvent.EXECUTED); + } + } + + @SuppressWarnings("deprecation") + private boolean isDeviceCompatible(Device device) { + Kernel.EXECUTION_MODE mode = kernel.getExecutionMode(); + if (mode != Kernel.EXECUTION_MODE.AUTO) { + switch (device.getType()) { + case GPU: + return mode == Kernel.EXECUTION_MODE.GPU; + case CPU: + return mode == Kernel.EXECUTION_MODE.CPU; + case JTP: + return mode == Kernel.EXECUTION_MODE.JTP; + case SEQ: + return mode == Kernel.EXECUTION_MODE.SEQ; + case ACC: + return mode == Kernel.EXECUTION_MODE.ACC; + default: + return false; + } + } else { + return (device == kernel.getTargetDevice()); } } @@ -1394,14 +1498,11 @@ public class KernelRunner extends KernelRunnerJNI{ if (!executing) { return PASS_ID_COMPLETED_EXECUTION; } - switch (kernel.getExecutionMode()) { - case NONE: - return PASS_ID_COMPLETED_EXECUTION; - case JTP: // fallthrough - case SEQ: - return getCurrentPassLocal(); - default: - return getCurrentPassRemote(); + + if (kernel.isRunningCL()) { + return getCurrentPassRemote(); + } else { + return getCurrentPassLocal(); } } @@ -1520,17 +1621,14 @@ public class KernelRunner extends KernelRunnerJNI{ * @see Kernel#get(boolean[] arr) */ public void get(Object array) { - if (explicit - && 
((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU) - || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) { - // Only makes sense when we are using OpenCL + if (explicit && (kernel.isRunningCL())) { + // Only makes sense when we are using OpenCL getJNI(jniContextHandle, array); } } public List<ProfileInfo> getProfileInfo() { - if (((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel - .getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) { + if (explicit && (kernel.isRunningCL())) { // Only makes sense when we are using OpenCL return (getProfileInfoJNI(jniContextHandle)); } else { @@ -1554,9 +1652,7 @@ public class KernelRunner extends KernelRunnerJNI{ */ public void put(Object array) { - if (explicit - && ((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU) - || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) { + if (explicit && (kernel.isRunningCL())) { // Only makes sense when we are using OpenCL puts.add(array); } @@ -1572,33 +1668,33 @@ public class KernelRunner extends KernelRunnerJNI{ return (explicit); } - /** - * Determine the time taken to convert bytecode to OpenCL for first Kernel.execute(range) call. - * - * @return The time spent preparing the kernel for execution using GPU - * - */ - public long getConversionTime() { - return conversionTime; - } - - /** - * Determine the execution time of the previous Kernel.execute(range) call. 
- * - * @return The time spent executing the kernel (ms) - * - */ - public long getExecutionTime() { - return executionTime; - } + private static class ExecutionSettings { + final KernelPreferences preferences; + final KernelProfile profile; + final String entrypoint; + Range range; + final int passes; + final boolean legacyExecutionMode; + + private ExecutionSettings(KernelPreferences preferences, KernelProfile profile, String entrypoint, Range range, int passes, boolean legacyExecutionMode) { + this.preferences = preferences; + this.profile = profile; + this.entrypoint = entrypoint; + this.range = range; + this.passes = passes; + this.legacyExecutionMode = legacyExecutionMode; + } - /** - * Determine the accumulated execution time of all previous Kernel.execute(range) calls. - * - * @return The accumulated time spent executing this kernel (ms) - * - */ - public long getAccumulatedExecutionTime() { - return accumulatedExecutionTime; + @Override + public String toString() { + return "ExecutionSettings{" + + "preferences=" + preferences + + ", profile=" + profile + + ", entrypoint='" + entrypoint + '\'' + + ", range=" + range + + ", passes=" + passes + + ", legacyExecutionMode=" + legacyExecutionMode + + '}'; + } } } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java new file mode 100644 index 0000000000000000000000000000000000000000..77959b65cc75208325ffbafd5e954c6499aa07cd --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java @@ -0,0 +1,8 @@ +package com.amd.aparapi.internal.kernel; + +/** + * Created by Barney on 02/09/2015. 
+ */ +public enum ProfilingEvent { + START, CLASS_MODEL_BUILT, OPENCL_GENERATED, OPENCL_COMPILED, PREPARE_EXECUTE, EXECUTED +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java index e4728c5e892f769305fcec0f8b29878a1aecbd41..132f4f21ae49d9371e19914bfc03805f5aceb880 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java @@ -45,6 +45,7 @@ import com.amd.aparapi.internal.model.ValueCache.ThrowingValueComputer; import com.amd.aparapi.internal.model.ClassModel.AttributePool.*; import com.amd.aparapi.internal.model.ClassModel.ConstantPool.*; import com.amd.aparapi.internal.reader.*; +import com.amd.aparapi.internal.util.*; import java.io.*; import java.lang.reflect.*; @@ -2629,7 +2630,7 @@ public class ClassModel{ methods.add(method); } - attributePool = new AttributePool(byteReader, getClassWeAreModelling().getSimpleName()); + attributePool = new AttributePool(byteReader, Reflection.getSimpleName(getClassWeAreModelling())); } public int getMagic() { diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java index 7ae155efa905a1dca6cd88f39931977a6ea9317a..974dac64adfec1c2ba8ca681c3576e6ccad28fda 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java @@ -62,8 +62,6 @@ public class Entrypoint implements Cloneable { private Object kernelInstance = null; - private final boolean fallback = false; - private final Set<String> referencedFieldNames = new LinkedHashSet<String>(); private final Set<String> arrayFieldAssignments = new LinkedHashSet<String>(); @@ -474,7 +472,7 @@ public class Entrypoint implements Cloneable { // methodMap now contains a list of 
method called by run itself(). // Walk the whole graph of called methods and add them to the methodMap - while (!fallback && discovered) { + while (discovered) { discovered = false; for (final MethodModel mm : new ArrayList<MethodModel>(methodMap.values())) { for (final MethodCall methodCall : mm.getMethodCalls()) { @@ -506,295 +504,288 @@ public class Entrypoint implements Cloneable { methodModel.checkForRecursion(new HashSet<MethodModel>()); - if (logger.isLoggable(Level.FINE)) { - logger.fine("fallback=" + fallback); - } - - if (!fallback) { - calledMethods.addAll(methodMap.values()); - Collections.reverse(calledMethods); - final List<MethodModel> methods = new ArrayList<MethodModel>(calledMethods); + calledMethods.addAll(methodMap.values()); + Collections.reverse(calledMethods); + final List<MethodModel> methods = new ArrayList<MethodModel>(calledMethods); - // add method to the calledMethods so we can include in this list - methods.add(methodModel); - final Set<String> fieldAssignments = new HashSet<String>(); + // add method to the calledMethods so we can include in this list + methods.add(methodModel); + final Set<String> fieldAssignments = new HashSet<String>(); - final Set<String> fieldAccesses = new HashSet<String>(); + final Set<String> fieldAccesses = new HashSet<String>(); - for (final MethodModel methodModel : methods) { - - // Record which pragmas we need to enable - if (methodModel.requiresDoublePragma()) { - usesDoubles = true; - if (logger.isLoggable(Level.FINE)) { - logger.fine("Enabling doubles on " + methodModel.getName()); - } + for (final MethodModel methodModel : methods) { + // Record which pragmas we need to enable + if (methodModel.requiresDoublePragma()) { + usesDoubles = true; + if (logger.isLoggable(Level.FINE)) { + logger.fine("Enabling doubles on " + methodModel.getName()); } - if (methodModel.requiresByteAddressableStorePragma()) { - usesByteWrites = true; - if (logger.isLoggable(Level.FINE)) { - logger.fine("Enabling byte 
addressable on " + methodModel.getName()); - } + + } + if (methodModel.requiresByteAddressableStorePragma()) { + usesByteWrites = true; + if (logger.isLoggable(Level.FINE)) { + logger.fine("Enabling byte addressable on " + methodModel.getName()); } + } - for (Instruction instruction = methodModel.getPCHead(); instruction != null; instruction = instruction.getNextPC()) { + for (Instruction instruction = methodModel.getPCHead(); instruction != null; instruction = instruction.getNextPC()) { - if (instruction instanceof AssignToArrayElement) { - final AssignToArrayElement assignment = (AssignToArrayElement) instruction; + if (instruction instanceof AssignToArrayElement) { + final AssignToArrayElement assignment = (AssignToArrayElement) instruction; - final Instruction arrayRef = assignment.getArrayRef(); - // AccessField here allows instance and static array refs - if (arrayRef instanceof I_GETFIELD) { - final I_GETFIELD getField = (I_GETFIELD) arrayRef; - final FieldEntry field = getField.getConstantPoolFieldEntry(); - final String assignedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - arrayFieldAssignments.add(assignedArrayFieldName); - referencedFieldNames.add(assignedArrayFieldName); + final Instruction arrayRef = assignment.getArrayRef(); + // AccessField here allows instance and static array refs + if (arrayRef instanceof I_GETFIELD) { + final I_GETFIELD getField = (I_GETFIELD) arrayRef; + final FieldEntry field = getField.getConstantPoolFieldEntry(); + final String assignedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + arrayFieldAssignments.add(assignedArrayFieldName); + referencedFieldNames.add(assignedArrayFieldName); - } - } else if (instruction instanceof AccessArrayElement) { - final AccessArrayElement access = (AccessArrayElement) instruction; - - final Instruction arrayRef = access.getArrayRef(); - // AccessField here allows instance and static array refs - if (arrayRef instanceof I_GETFIELD) { - 
final I_GETFIELD getField = (I_GETFIELD) arrayRef; - final FieldEntry field = getField.getConstantPoolFieldEntry(); - final String accessedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - arrayFieldAccesses.add(accessedArrayFieldName); - referencedFieldNames.add(accessedArrayFieldName); + } + } else if (instruction instanceof AccessArrayElement) { + final AccessArrayElement access = (AccessArrayElement) instruction; + + final Instruction arrayRef = access.getArrayRef(); + // AccessField here allows instance and static array refs + if (arrayRef instanceof I_GETFIELD) { + final I_GETFIELD getField = (I_GETFIELD) arrayRef; + final FieldEntry field = getField.getConstantPoolFieldEntry(); + final String accessedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + arrayFieldAccesses.add(accessedArrayFieldName); + referencedFieldNames.add(accessedArrayFieldName); - } - } else if (instruction instanceof I_ARRAYLENGTH) { - Instruction child = instruction.getFirstChild(); - while(child instanceof I_AALOAD) { - child = child.getFirstChild(); - } - if (!(child instanceof AccessField)) { - throw new ClassParseException(ClassParseException.TYPE.LOCALARRAYLENGTHACCESS); - } - final AccessField childField = (AccessField) child; - final String arrayName = childField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - arrayFieldArrayLengthUsed.add(arrayName); - if (logger.isLoggable(Level.FINE)) { - logger.fine("Noted arraylength in " + methodModel.getName() + " on " + arrayName); - } - } else if (instruction instanceof AccessField) { - final AccessField access = (AccessField) instruction; - final FieldEntry field = access.getConstantPoolFieldEntry(); - final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - fieldAccesses.add(accessedFieldName); - referencedFieldNames.add(accessedFieldName); - - final String signature = 
field.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - if (logger.isLoggable(Level.FINE)) { - logger.fine("AccessField field type= " + signature + " in " + methodModel.getName()); - } + } + } else if (instruction instanceof I_ARRAYLENGTH) { + Instruction child = instruction.getFirstChild(); + while(child instanceof I_AALOAD) { + child = child.getFirstChild(); + } + if (!(child instanceof AccessField)) { + throw new ClassParseException(ClassParseException.TYPE.LOCALARRAYLENGTHACCESS); + } + final AccessField childField = (AccessField) child; + final String arrayName = childField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + arrayFieldArrayLengthUsed.add(arrayName); + if (logger.isLoggable(Level.FINE)) { + logger.fine("Noted arraylength in " + methodModel.getName() + " on " + arrayName); + } + } else if (instruction instanceof AccessField) { + final AccessField access = (AccessField) instruction; + final FieldEntry field = access.getConstantPoolFieldEntry(); + final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + fieldAccesses.add(accessedFieldName); + referencedFieldNames.add(accessedFieldName); + + final String signature = field.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); + if (logger.isLoggable(Level.FINE)) { + logger.fine("AccessField field type= " + signature + " in " + methodModel.getName()); + } - // Add the class model for the referenced obj array - if (signature.startsWith("[L")) { - // Turn [Lcom/amd/javalabs/opencl/demo/DummyOOA; into com.amd.javalabs.opencl.demo.DummyOOA for example - final String className = (signature.substring(2, signature.length() - 1)).replace('/', '.'); - final ClassModel arrayFieldModel = getOrUpdateAllClassAccesses(className); - if (arrayFieldModel != null) { - final Class<?> memberClass = arrayFieldModel.getClassWeAreModelling(); - final int modifiers = memberClass.getModifiers(); - if (!Modifier.isFinal(modifiers)) { - throw new 
ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTNONFINAL); - } + // Add the class model for the referenced obj array + if (signature.startsWith("[L")) { + // Turn [Lcom/amd/javalabs/opencl/demo/DummyOOA; into com.amd.javalabs.opencl.demo.DummyOOA for example + final String className = (signature.substring(2, signature.length() - 1)).replace('/', '.'); + final ClassModel arrayFieldModel = getOrUpdateAllClassAccesses(className); + if (arrayFieldModel != null) { + final Class<?> memberClass = arrayFieldModel.getClassWeAreModelling(); + final int modifiers = memberClass.getModifiers(); + if (!Modifier.isFinal(modifiers)) { + throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTNONFINAL); + } - final ClassModel refModel = objectArrayFieldsClasses.get(className); - if (refModel == null) { - - // Verify no other member with common parent - for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) { - ClassModel superModel = memberObjClass; - while (superModel != null) { - if (superModel.isSuperClass(memberClass)) { - throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTFIELDNAMECONFLICT); - } - superModel = superModel.getSuperClazz(); + final ClassModel refModel = objectArrayFieldsClasses.get(className); + if (refModel == null) { + + // Verify no other member with common parent + for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) { + ClassModel superModel = memberObjClass; + while (superModel != null) { + if (superModel.isSuperClass(memberClass)) { + throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTFIELDNAMECONFLICT); } + superModel = superModel.getSuperClazz(); } + } - objectArrayFieldsClasses.put(className, arrayFieldModel); - if (logger.isLoggable(Level.FINE)) { - logger.fine("adding class to objectArrayFields: " + className); - } + objectArrayFieldsClasses.put(className, arrayFieldModel); + if (logger.isLoggable(Level.FINE)) { + logger.fine("adding class to 
objectArrayFields: " + className); } } - } else { - final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.'); - // Look for object data member access - if (!className.equals(getClassModel().getClassWeAreModelling().getName()) - && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), accessedFieldName) == null)) { - updateObjectMemberFieldAccesses(className, field); - } } - - } else if (instruction instanceof AssignToField) { - final AssignToField assignment = (AssignToField) instruction; - final FieldEntry field = assignment.getConstantPoolFieldEntry(); - final String assignedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - fieldAssignments.add(assignedFieldName); - referencedFieldNames.add(assignedFieldName); - + } else { final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.'); // Look for object data member access if (!className.equals(getClassModel().getClassWeAreModelling().getName()) - && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), assignedFieldName) == null)) { + && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), accessedFieldName) == null)) { updateObjectMemberFieldAccesses(className, field); - } else { + } + } - if ((!Config.enablePUTFIELD) && methodModel.methodUsesPutfield() && !methodModel.isSetter()) { - throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTONLYSUPPORTSSIMPLEPUTFIELD); - } + } else if (instruction instanceof AssignToField) { + final AssignToField assignment = (AssignToField) instruction; + final FieldEntry field = assignment.getConstantPoolFieldEntry(); + final String assignedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + fieldAssignments.add(assignedFieldName); + referencedFieldNames.add(assignedFieldName); + + final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.'); + // Look for object data 
member access + if (!className.equals(getClassModel().getClassWeAreModelling().getName()) + && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), assignedFieldName) == null)) { + updateObjectMemberFieldAccesses(className, field); + } else { + if ((!Config.enablePUTFIELD) && methodModel.methodUsesPutfield() && !methodModel.isSetter()) { + throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTONLYSUPPORTSSIMPLEPUTFIELD); } } - else if (instruction instanceof I_INVOKEVIRTUAL) { - final I_INVOKEVIRTUAL invokeInstruction = (I_INVOKEVIRTUAL) instruction; - MethodModel invokedMethod = invokeInstruction.getMethod(); - FieldEntry getterField = getSimpleGetterField(invokedMethod); - if (getterField != null) { - referencedFieldNames.add(getterField.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); - } - else { - final MethodEntry methodEntry = invokeInstruction.getConstantPoolMethodEntry(); - if (Kernel.isMappedMethod(methodEntry)) { //only do this for intrinsics - if (Kernel.usesAtomic32(methodEntry)) { - setRequiresAtomics32Pragma(true); - } + } + else if (instruction instanceof I_INVOKEVIRTUAL) { + final I_INVOKEVIRTUAL invokeInstruction = (I_INVOKEVIRTUAL) instruction; + MethodModel invokedMethod = invokeInstruction.getMethod(); + FieldEntry getterField = getSimpleGetterField(invokedMethod); + if (getterField != null) { + referencedFieldNames.add(getterField.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); + } + else { + final MethodEntry methodEntry = invokeInstruction.getConstantPoolMethodEntry(); + if (Kernel.isMappedMethod(methodEntry)) { //only do this for intrinsics - final Arg methodArgs[] = methodEntry.getArgs(); - if ((methodArgs.length > 0) && methodArgs[0].isArray()) { //currently array arg can only take slot 0 - final Instruction arrInstruction = invokeInstruction.getArg(0); - if (arrInstruction instanceof AccessField) { - final AccessField access = (AccessField) arrInstruction; - final FieldEntry field = 
access.getConstantPoolFieldEntry(); - final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - arrayFieldAssignments.add(accessedFieldName); - referencedFieldNames.add(accessedFieldName); - } - else { - throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTSETTERARRAY); - } - } + if (Kernel.usesAtomic32(methodEntry)) { + setRequiresAtomics32Pragma(true); } + final Arg methodArgs[] = methodEntry.getArgs(); + if ((methodArgs.length > 0) && methodArgs[0].isArray()) { //currently array arg can only take slot 0 + final Instruction arrInstruction = invokeInstruction.getArg(0); + if (arrInstruction instanceof AccessField) { + final AccessField access = (AccessField) arrInstruction; + final FieldEntry field = access.getConstantPoolFieldEntry(); + final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + arrayFieldAssignments.add(accessedFieldName); + referencedFieldNames.add(accessedFieldName); + } + else { + throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTSETTERARRAY); + } + } } + } } } + } - for (final String referencedFieldName : referencedFieldNames) { + for (final String referencedFieldName : referencedFieldNames) { - try { - final Class<?> clazz = classModel.getClassWeAreModelling(); - final Field field = getFieldFromClassHierarchy(clazz, referencedFieldName); - if (field != null) { - referencedFields.add(field); - final ClassModelField ff = classModel.getField(referencedFieldName); - assert ff != null : "ff should not be null for " + clazz.getName() + "." 
+ referencedFieldName; - referencedClassModelFields.add(ff); - } - } catch (final SecurityException e) { - e.printStackTrace(); + try { + final Class<?> clazz = classModel.getClassWeAreModelling(); + final Field field = getFieldFromClassHierarchy(clazz, referencedFieldName); + if (field != null) { + referencedFields.add(field); + final ClassModelField ff = classModel.getField(referencedFieldName); + assert ff != null : "ff should not be null for " + clazz.getName() + "." + referencedFieldName; + referencedClassModelFields.add(ff); } + } catch (final SecurityException e) { + e.printStackTrace(); } + } - // Build data needed for oop form transforms if necessary - if (!objectArrayFieldsClasses.keySet().isEmpty()) { + // Build data needed for oop form transforms if necessary + if (!objectArrayFieldsClasses.keySet().isEmpty()) { - for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) { + for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) { - // At this point we have already done the field override safety check, so - // add all the superclass fields into the kernel member class to be - // sorted by size and emitted into the struct - ClassModel superModel = memberObjClass.getSuperClazz(); - while (superModel != null) { - if (logger.isLoggable(Level.FINEST)) { - logger.finest("adding = " + superModel.getClassWeAreModelling().getName() + " fields into " - + memberObjClass.getClassWeAreModelling().getName()); - } - memberObjClass.getStructMembers().addAll(superModel.getStructMembers()); - superModel = superModel.getSuperClazz(); + // At this point we have already done the field override safety check, so + // add all the superclass fields into the kernel member class to be + // sorted by size and emitted into the struct + ClassModel superModel = memberObjClass.getSuperClazz(); + while (superModel != null) { + if (logger.isLoggable(Level.FINEST)) { + logger.finest("adding = " + superModel.getClassWeAreModelling().getName() + " fields 
into " + + memberObjClass.getClassWeAreModelling().getName()); } + memberObjClass.getStructMembers().addAll(superModel.getStructMembers()); + superModel = superModel.getSuperClazz(); } + } - // Sort fields of each class biggest->smallest - final Comparator<FieldEntry> fieldSizeComparator = new Comparator<FieldEntry>(){ - @Override public int compare(FieldEntry aa, FieldEntry bb) { - final String aType = aa.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - final String bType = bb.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - - // Booleans get converted down to bytes - final int aSize = InstructionSet.TypeSpec.valueOf(aType.equals("Z") ? "B" : aType).getSize(); - final int bSize = InstructionSet.TypeSpec.valueOf(bType.equals("Z") ? "B" : bType).getSize(); + // Sort fields of each class biggest->smallest + final Comparator<FieldEntry> fieldSizeComparator = new Comparator<FieldEntry>(){ + @Override public int compare(FieldEntry aa, FieldEntry bb) { + final String aType = aa.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); + final String bType = bb.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - if (logger.isLoggable(Level.FINEST)) { - logger.finest("aType= " + aType + " aSize= " + aSize + " . . bType= " + bType + " bSize= " + bSize); - } + // Booleans get converted down to bytes + final int aSize = InstructionSet.TypeSpec.valueOf(aType.equals("Z") ? "B" : aType).getSize(); + final int bSize = InstructionSet.TypeSpec.valueOf(bType.equals("Z") ? "B" : bType).getSize(); - // Note this is sorting in reverse order so the biggest is first - if (aSize > bSize) { - return -1; - } else if (aSize == bSize) { - return 0; - } else { - return 1; - } + if (logger.isLoggable(Level.FINEST)) { + logger.finest("aType= " + aType + " aSize= " + aSize + " . . 
bType= " + bType + " bSize= " + bSize); } - }; - - for (final ClassModel c : objectArrayFieldsClasses.values()) { - final ArrayList<FieldEntry> fields = c.getStructMembers(); - if (fields.size() > 0) { - Collections.sort(fields, fieldSizeComparator); - - // Now compute the total size for the struct - int totalSize = 0; - int alignTo = 0; - - for (final FieldEntry f : fields) { - // Record field offset for use while copying - // Get field we will copy out of the kernel member object - final Field rfield = getFieldFromClassHierarchy(c.getClassWeAreModelling(), f.getNameAndTypeEntry() - .getNameUTF8Entry().getUTF8()); - - c.getStructMemberOffsets().add(UnsafeWrapper.objectFieldOffset(rfield)); - - final String fType = f.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - //c.getStructMemberTypes().add(TypeSpec.valueOf(fType.equals("Z") ? "B" : fType)); - c.getStructMemberTypes().add(TypeSpec.valueOf(fType)); - final int fSize = TypeSpec.valueOf(fType.equals("Z") ? "B" : fType).getSize(); - if (fSize > alignTo) { - alignTo = fSize; - } - totalSize += fSize; - if (logger.isLoggable(Level.FINEST)) { - logger.finest("Field = " + f.getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " size=" + fSize - + " totalSize=" + totalSize); - } + // Note this is sorting in reverse order so the biggest is first + if (aSize > bSize) { + return -1; + } else if (aSize == bSize) { + return 0; + } else { + return 1; + } + } + }; + + for (final ClassModel c : objectArrayFieldsClasses.values()) { + final ArrayList<FieldEntry> fields = c.getStructMembers(); + if (fields.size() > 0) { + Collections.sort(fields, fieldSizeComparator); + + // Now compute the total size for the struct + int totalSize = 0; + int alignTo = 0; + + for (final FieldEntry f : fields) { + // Record field offset for use while copying + // Get field we will copy out of the kernel member object + final Field rfield = getFieldFromClassHierarchy(c.getClassWeAreModelling(), f.getNameAndTypeEntry() + 
.getNameUTF8Entry().getUTF8()); + + c.getStructMemberOffsets().add(UnsafeWrapper.objectFieldOffset(rfield)); + + final String fType = f.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); + //c.getStructMemberTypes().add(TypeSpec.valueOf(fType.equals("Z") ? "B" : fType)); + c.getStructMemberTypes().add(TypeSpec.valueOf(fType)); + final int fSize = TypeSpec.valueOf(fType.equals("Z") ? "B" : fType).getSize(); + if (fSize > alignTo) { + alignTo = fSize; } - // compute total size for OpenCL buffer - int totalStructSize = 0; - if ((totalSize % alignTo) == 0) { - totalStructSize = totalSize; - } else { - // Pad up if necessary - totalStructSize = ((totalSize / alignTo) + 1) * alignTo; + totalSize += fSize; + if (logger.isLoggable(Level.FINEST)) { + logger.finest("Field = " + f.getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " size=" + fSize + + " totalSize=" + totalSize); } - c.setTotalStructSize(totalStructSize); } + + // compute total size for OpenCL buffer + int totalStructSize = 0; + if ((totalSize % alignTo) == 0) { + totalStructSize = totalSize; + } else { + // Pad up if necessary + totalStructSize = ((totalSize / alignTo) + 1) * alignTo; + } + c.setTotalStructSize(totalStructSize); } } - } } @@ -807,10 +798,6 @@ public class Entrypoint implements Cloneable { return method.getAccessorVariableFieldEntry(); } - public boolean shouldFallback() { - return (fallback); - } - public List<ClassModel.ClassModelField> getReferencedClassModelFields() { return (referencedClassModelFields); } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java index 12b52360ef683e9bf74d2bf9a5f2a2b73d2092c0..1f8321336f6999aec5fc7540f65d32ab07cef2bd 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java @@ -1,10 +1,9 @@ package 
com.amd.aparapi.internal.opencl; -import java.util.ArrayList; -import java.util.List; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.jni.*; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.internal.jni.OpenCLJNI; +import java.util.*; public class OpenCLPlatform extends OpenCLJNI{ @@ -18,6 +17,8 @@ public class OpenCLPlatform extends OpenCLJNI{ private final List<OpenCLDevice> devices = new ArrayList<OpenCLDevice>(); + private static List<OpenCLPlatform> platforms; + /** * Default constructor */ @@ -51,11 +52,14 @@ public class OpenCLPlatform extends OpenCLJNI{ } public List<OpenCLPlatform> getOpenCLPlatforms() { - if (OpenCLLoader.isOpenCLAvailable()) { - return (getPlatforms()); - } else { - return (new ArrayList<OpenCLPlatform>()); + if (platforms == null) { + if (OpenCLLoader.isOpenCLAvailable()) { + platforms = getPlatforms(); + } else { + return (Collections.EMPTY_LIST); + } } + return platforms; } public String getName() { diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java new file mode 100644 index 0000000000000000000000000000000000000000..3f2ad65d866931cb04f9739020717f47ba15fc4f --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java @@ -0,0 +1,18 @@ +package com.amd.aparapi.internal.util; + +/** + * Created by Barney on 03/09/2015. + */ +public class Reflection { + + /** Avoids getting dumb empty names for anonymous inners. */ + public static String getSimpleName(Class<?> klass) { + String simpleName = klass.getSimpleName(); + if (simpleName.isEmpty()) { + String fullName = klass.getName(); + int index = fullName.lastIndexOf('.'); + simpleName = (index < 0) ? 
fullName : fullName.substring(index + 1); + } + return simpleName; + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java new file mode 100644 index 0000000000000000000000000000000000000000..2e52a6e1113f61d475f46a942800d15a1088d052 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java @@ -0,0 +1,82 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.*; +import com.amd.aparapi.internal.kernel.*; + +import java.util.*; + +/** + * Tests device selection via {@link com.amd.aparapi.internal.kernel.KernelManager}. + */ +public class ConfigurationDemo { + public static void main(String[] ignored) { + System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); + + StringBuilder report; + + List<Integer> tests = Arrays.asList(0, 1, 2, 3); + int reps = 300; + for (int rep = 0; rep < reps; ++rep) { + runTests(rep == 0, tests); + + if (rep % 100 == 99 || rep == 0) { + report = new StringBuilder("rep = " + rep + "\n"); + KernelManager.instance().reportDeviceUsage(report, true); + System.out.println(report); + } + } + } + + private static void runTests(boolean verbose, List<Integer> testIndicesToRun) { + final int globalSize = 1; + Kernel kernel; + if (testIndicesToRun.contains(0)) { + if (verbose) { + System.out.println(); + System.out.println("Testing default KernelPreferences with kernel which cannot be run in OpenCL, with fallback algorithm"); + System.out.println(); + } + kernel = new KernelWithAlternateFallbackAlgorithm(); + kernel.execute(globalSize); + kernel.dispose(); + } + + if (testIndicesToRun.contains(1)) { + if (verbose) { + System.out.println(); + System.out.println("Testing default KernelPreferences with kernel which cannot be run in OpenCL, without fallback algorithm"); + System.out.println(); + } + kernel = new 
KernelWithoutAlternateFallbackAlgorithm(); + kernel.execute(globalSize); + kernel.dispose(); + } + + if (testIndicesToRun.contains(2)) { + if (verbose) { + System.out.println(); + System.out.println("Retesting previous case, should jump straight to regular java implementation without warnings"); + System.out.println(); + } + kernel = new KernelWithoutAlternateFallbackAlgorithm(); + kernel.execute(globalSize); + kernel.dispose(); + } + + if (testIndicesToRun.contains(3)) { + if (verbose) { + System.out.println(); + System.out.println("Testing default KernelPreferences with kernel which should be run in OpenCL"); + System.out.println(); + } + KernelOkayInOpenCL clKernel = new KernelOkayInOpenCL(); + kernel = clKernel; + kernel.execute(clKernel.inChars.length); + String result = new String(clKernel.outChars); + if (verbose) { + System.out.println("kernel output: " + result); + } + kernel.dispose(); + } + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java new file mode 100644 index 0000000000000000000000000000000000000000..476737d42fe97c6d3bd6eee1fc5f78fc105ebaf5 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java @@ -0,0 +1,42 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; + +import java.util.*; + +/** + * Created by Barney on 31/08/2015. 
+ */ +public class CustomConfigurationDemo { + + public static void main(String[] ignored) { + System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); + KernelManager manager = new KernelManager() { + @Override + protected List<Device.TYPE> getPreferredDeviceTypes() { + return Arrays.asList(Device.TYPE.CPU, Device.TYPE.ALT, Device.TYPE.JTP); + } + }; + KernelManager.setKernelManager(manager); + + System.out.println("\nTesting custom KernelPreferences with kernel, preferences choose CPU"); + KernelOkayInOpenCL kernel = new KernelOkayInOpenCL(); + kernel.execute(kernel.inChars.length); + System.out.println(kernel.outChars); + + System.out.println("\nTesting custom KernelPreferences with kernel, preferences specify CPU but kernel vetos CPU"); + kernel = new KernelOkayInOpenCL() { + @Override + public boolean isAllowDevice(Device _device) { + return _device.getType() != Device.TYPE.CPU; + } + }; + kernel.execute(kernel.inChars.length); + System.out.println(kernel.outChars); + + StringBuilder report = new StringBuilder("\n"); + KernelManager.instance().reportDeviceUsage(report, true); + System.out.println(report); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java new file mode 100644 index 0000000000000000000000000000000000000000..6ed54e5b7ef47eab954c42a3e9df5a795de42566 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java @@ -0,0 +1,21 @@ +package com.amd.aparapi.sample.configuration; + +/** + * Created by Barney on 24/08/2015. 
+ */ +public class KernelOkayInOpenCL extends com.amd.aparapi.Kernel { + char[] inChars = "KernelOkayInOpenCL".toCharArray(); + char[] outChars = new char[inChars.length]; + + @Override + public void run() { + int index = getGlobalId(); + oops(); + outChars[index] = inChars[index]; + } + + @NoCL + private void oops() { + System.out.println("Oops, running in kernel in Java"); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java new file mode 100644 index 0000000000000000000000000000000000000000..670e6a669193d05d017648f04515a439d9f0b8d1 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java @@ -0,0 +1,24 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.*; + +/** + * Kernel which will always fail to run on an OpenCLDevice but has an alternative fallback algorithm. 
+ */ +public class KernelWithAlternateFallbackAlgorithm extends Kernel { + @Override + public void run() { + // deliberately, will fail to generate OpenCL as println is unsupported + System.out.println("Running in Java (regular algorithm)"); + } + + @Override + public boolean hasFallbackAlgorithm() { + return true; + } + + @Override + public void executeFallbackAlgorithm(Range _range, int _passes) { + System.out.println("Running in Java (alternate non-parallel algorithm)"); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java new file mode 100644 index 0000000000000000000000000000000000000000..1096a092e38c2c696c153d969eb31b54c4d8c844 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java @@ -0,0 +1,14 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.*; + +/** + * Kernel which will always fail to run on an OpenCLDevice and has no alternative fallback algorithm.
+ */ +public class KernelWithoutAlternateFallbackAlgorithm extends Kernel { + @Override + public void run() { + // deliberately, will fail to generate OpenCL as println is unsupported + System.out.println("Running in Java (regular algorithm)"); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java new file mode 100644 index 0000000000000000000000000000000000000000..db4149a139f3b50d49de50b94c48ceafe98ec4e5 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java @@ -0,0 +1,26 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.*; +import com.amd.aparapi.internal.kernel.*; + +/** + * Tests device selection when circumventing the {@link com.amd.aparapi.internal.kernel.KernelManager} by using the legacy mechanism + * (setExecutionMode, etc.). + */ +public class LegacyConfigurationDemo { + + @SuppressWarnings("deprecation") + public static void main(String[] ignored) { + System.setProperty("com.amd.aparapi.executionMode", "GPU,CPU,SEQ"); + System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); + + KernelWithAlternateFallbackAlgorithm kernel = new KernelWithAlternateFallbackAlgorithm(); + kernel.setExecutionMode(Kernel.EXECUTION_MODE.GPU); + int globalRange = 1; + kernel.execute(globalRange); + + StringBuilder report = new StringBuilder("\n"); + KernelManager.instance().reportDeviceUsage(report, true); + System.out.println(report); + } +} diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java index 57f96b060a8c0993309d714e7e804fffa704bcc8..4b916b252e1ba399bea1c57f5860c2f4d6d9ea68 100644 --- a/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java +++ 
b/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java @@ -38,13 +38,13 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.sample.convolution; -import java.io.File; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; +import java.io.*; public class ConvolutionOpenCL{ @@ -61,7 +61,7 @@ public class ConvolutionOpenCL{ public static void main(final String[] _args) { final File file = new File(_args.length == 1 ? _args[0] : "testcard.jpg"); - final OpenCLDevice openclDevice = (OpenCLDevice) Device.best(); + final OpenCLDevice openclDevice = (OpenCLDevice) KernelManager.instance().bestDevice(); final Convolution convolution = openclDevice.bind(Convolution.class); final float convMatrix3x3[] = new float[] { diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java index 9284fa503455f429b2f10d9cfdaa519e7f183650..7c575c7a2c8200b95e8755e49fc8d15992ac1ea4 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java @@ -1,12 +1,12 @@ package com.amd.aparapi.sample.extension; -import java.util.Arrays; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; 
+import java.util.*; public class FFTExample{ @@ -98,7 +98,7 @@ public class FFTExample{ final float imaginary[] = new float[LEN]; final float referenceReal[] = Arrays.copyOf(real, real.length); final float referenceImaginary[] = Arrays.copyOf(imaginary, imaginary.length); - final OpenCLDevice device = (OpenCLDevice) Device.best(); + final OpenCLDevice device = (OpenCLDevice) KernelManager.instance().getDefaultPreferences().getPreferredDevice(null); final FFT fft = device.bind(FFT.class); for (int i = 0; i < LEN; i++) { initial[i] = real[i] = referenceReal[i] = (float) (Math.random() * 256); diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java b/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java index 54b06c3b0a98eae4b8685b3762959f44d9c9e232..e260d5e825f5a287f987bcc5ac063ed68a8a0041 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java @@ -41,7 +41,6 @@ public class Histogram{ System.out.println("binResult size=" + binResult.length); final int[] histo = new int[BIN_SIZE]; final int[] refHisto = new int[BIN_SIZE]; - final Device device = Device.firstGPU(); final Kernel k = new Kernel(){ @Override public void run() { @@ -52,6 +51,7 @@ public class Histogram{ } }; + final Device device = k.getTargetDevice(); final Range range2 = device.createRange(BIN_SIZE); k.execute(range2); diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java b/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java index 1ff76e9f5593423d2ab04c4dc73617937efc47f5..a0f74813706604358021cdc53d02663332d63a67 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java @@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension; import com.amd.aparapi.Range; import com.amd.aparapi.device.Device; 
import com.amd.aparapi.device.OpenCLDevice; +import com.amd.aparapi.internal.kernel.*; import com.amd.aparapi.opencl.OpenCL; public class HistogramIdeal{ @@ -40,7 +41,7 @@ public class HistogramIdeal{ System.out.println("binResult size=" + binResult.length); final int[] histo = new int[BIN_SIZE]; final int[] refHisto = new int[BIN_SIZE]; - final Device device = Device.best(); + final Device device = KernelManager.instance().bestDevice(); if (device != null) { System.out.println(((OpenCLDevice) device).getOpenCLPlatform().getName()); diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java index ba2d20ac2d6b496bb7b766ac0edecd7a1d781c3d..85ac9cda4614810b3936c568ec47d39213e06ba6 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java @@ -38,37 +38,17 @@ under those regulations, please refer to the U.S. 
Bureau of Industry and Securit package com.amd.aparapi.sample.extension; -import java.awt.BorderLayout; -import java.awt.Dimension; -import java.awt.FlowLayout; -import java.awt.Graphics; -import java.awt.Point; -import java.awt.event.ItemEvent; -import java.awt.event.ItemListener; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.WindowAdapter; -import java.awt.event.WindowEvent; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferInt; -import java.util.concurrent.BrokenBarrierException; -import java.util.concurrent.CyclicBarrier; - -import javax.swing.JComboBox; -import javax.swing.JComponent; -import javax.swing.JFrame; -import javax.swing.JLabel; -import javax.swing.JPanel; -import javax.swing.JTextField; - -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.internal.opencl.OpenCLPlatform; -import com.amd.aparapi.internal.util.OpenCLUtil; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; -import com.amd.aparapi.opencl.OpenCLAdapter; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import java.awt.image.*; +import java.util.concurrent.*; /** * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
@@ -418,7 +398,7 @@ public class MandelExample{ float offsetx = .0f; float offsety = .0f; - Device device = Device.best(); + Device device = KernelManager.instance().bestDevice(); if (device instanceof OpenCLDevice) { final OpenCLDevice openclDevice = (OpenCLDevice) device; diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java b/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java index 1b9c993b7ed53ef6ad1db3f224aa5132b2540902..89faa7f2f6ef027a2d5163f9e7f139cb0080a43e 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java @@ -38,29 +38,16 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.sample.extension; -import java.awt.BorderLayout; -import java.awt.Dimension; -import java.awt.FlowLayout; -import java.awt.Graphics; -import java.awt.Point; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.WindowAdapter; -import java.awt.event.WindowEvent; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferInt; - -import javax.swing.JComponent; -import javax.swing.JFrame; -import javax.swing.JLabel; -import javax.swing.JPanel; -import javax.swing.JTextField; - -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import java.awt.image.*; /** * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
@@ -155,7 +142,7 @@ public class MandelSimple{ float offsetx = .0f; float offsety = .0f; - final Device device = Device.best(); + final Device device = KernelManager.instance().bestDevice(); if (device instanceof OpenCLDevice) { final OpenCLDevice openclDevice = (OpenCLDevice) device; diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java b/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java index 7bc046767b738a2fcbd496cc8155454863e31355..0ea3043e19eaf2fc0203beaffdb6709e7e1a2230 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java @@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension; import com.amd.aparapi.Range; import com.amd.aparapi.device.Device; import com.amd.aparapi.device.OpenCLDevice; +import com.amd.aparapi.internal.kernel.*; import com.amd.aparapi.opencl.OpenCL; import com.amd.aparapi.opencl.OpenCL.Resource; @@ -26,7 +27,7 @@ public class Pow4Example{ final float[] squares = new float[size]; final Range range = Range.create(size); - final Device device = Device.best(); + final Device device = KernelManager.instance().bestDevice(); if (device instanceof OpenCLDevice) { final OpenCLDevice openclDevice = (OpenCLDevice) device; diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java index e2b2aa4814ec8baf558f6a24fa92e8c54cc3c9cf..58f01c0b8789a51ae886b73e608dc2f3bb98b25d 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java @@ -4,6 +4,7 @@ import com.amd.aparapi.ProfileInfo; import com.amd.aparapi.Range; import com.amd.aparapi.device.Device; import com.amd.aparapi.device.OpenCLDevice; +import com.amd.aparapi.internal.kernel.*; import com.amd.aparapi.opencl.OpenCL; import 
com.amd.aparapi.opencl.OpenCL.Resource; import com.amd.aparapi.opencl.OpenCL.Source; @@ -54,7 +55,7 @@ public class SquareExample{ final float[] quads = new float[size]; final Range range = Range.create(size); - final Device device = Device.best(); + final Device device = KernelManager.instance().bestDevice(); if (device instanceof OpenCLDevice) { final OpenCLDevice openclDevice = (OpenCLDevice) device; diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java index 381b52faa4b694b5ae86a1618b8b2382c3a602cd..d5fe0bf9244580e431ae6d72ea6ae372a0998beb 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java @@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension; import com.amd.aparapi.Range; import com.amd.aparapi.device.Device; import com.amd.aparapi.device.OpenCLDevice; +import com.amd.aparapi.internal.kernel.*; import com.amd.aparapi.opencl.OpenCL; public class SwapExample{ @@ -29,7 +30,7 @@ public class SwapExample{ final float[] rhs = new float[size]; final Range range = Range.create(size); - final Device device = Device.best(); + final Device device = KernelManager.instance().bestDevice(); if (device instanceof OpenCLDevice) { final OpenCLDevice openclDevice = (OpenCLDevice) device; diff --git a/samples/info/src/com/amd/aparapi/sample/info/Main.java b/samples/info/src/com/amd/aparapi/sample/info/Main.java index fcff248937d1be7a55fed94e9bf5a047ca6ece9e..8397715d404927671ebb496cbaeb8cd925ab6022 100644 --- a/samples/info/src/com/amd/aparapi/sample/info/Main.java +++ b/samples/info/src/com/amd/aparapi/sample/info/Main.java @@ -38,11 +38,11 @@ under those regulations, please refer to the U.S. 
Bureau of Industry and Securit package com.amd.aparapi.sample.info; -import java.util.List; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.internal.opencl.*; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.internal.opencl.OpenCLPlatform; +import java.util.*; public class Main{ public static void main(String[] _args) { @@ -73,90 +73,13 @@ public class Main{ platformc++; } - Device bestDevice = OpenCLDevice.best(); - if (bestDevice == null) { - System.out.println("OpenCLDevice.best() returned null!"); - } else { - System.out.println("OpenCLDevice.best() returned { "); - System.out.println(" Type : " + bestDevice.getType()); - System.out.println(" GlobalMemSize : " + ((OpenCLDevice) bestDevice).getGlobalMemSize()); - System.out.println(" LocalMemSize : " + ((OpenCLDevice) bestDevice).getLocalMemSize()); - System.out.println(" MaxComputeUnits : " + ((OpenCLDevice) bestDevice).getMaxComputeUnits()); - System.out.println(" MaxWorkGroupSizes : " + ((OpenCLDevice) bestDevice).getMaxWorkGroupSize()); - System.out.println(" MaxWorkItemDimensions : " + ((OpenCLDevice) bestDevice).getMaxWorkItemDimensions()); - System.out.println("}"); - } - - Device firstCPU = OpenCLDevice.firstCPU(); - if (firstCPU == null) { - System.out.println("OpenCLDevice.firstCPU() returned null!"); - } else { - System.out.println("OpenCLDevice.firstCPU() returned { "); - System.out.println(" Type : " + firstCPU.getType()); - System.out.println(" GlobalMemSize : " + ((OpenCLDevice) firstCPU).getGlobalMemSize()); - System.out.println(" LocalMemSize : " + ((OpenCLDevice) firstCPU).getLocalMemSize()); - System.out.println(" MaxComputeUnits : " + ((OpenCLDevice) firstCPU).getMaxComputeUnits()); - System.out.println(" MaxWorkGroupSizes : " + ((OpenCLDevice) firstCPU).getMaxWorkGroupSize()); - System.out.println(" MaxWorkItemDimensions : " + ((OpenCLDevice) firstCPU).getMaxWorkItemDimensions()); 
- System.out.println("}"); - } - - Device firstGPU = OpenCLDevice.firstGPU(); - if (firstGPU == null) { - System.out.println("OpenCLDevice.firstGPU() returned null!"); - } else { - System.out.println("OpenCLDevice.firstGPU() returned { "); - System.out.println(" Type : " + firstGPU.getType()); - System.out.println(" GlobalMemSize : " + ((OpenCLDevice) firstGPU).getGlobalMemSize()); - System.out.println(" LocalMemSize : " + ((OpenCLDevice) firstGPU).getLocalMemSize()); - System.out.println(" MaxComputeUnits : " + ((OpenCLDevice) firstGPU).getMaxComputeUnits()); - System.out.println(" MaxWorkGroupSizes : " + ((OpenCLDevice) firstGPU).getMaxWorkGroupSize()); - System.out.println(" MaxWorkItemDimensions : " + ((OpenCLDevice) firstGPU).getMaxWorkItemDimensions()); - System.out.println("}"); - } + KernelPreferences preferences = KernelManager.instance().getDefaultPreferences(); + System.out.println("\nDevices in preferred order:\n"); - Device bestGPU = OpenCLDevice.bestGPU(); - if (bestGPU == null) { - System.out.println("OpenCLDevice.bestGPU() returned null!"); - } else { - System.out.println("OpenCLDevice.bestGPU() returned { "); - System.out.println(" Type : " + bestGPU.getType()); - System.out.println(" GlobalMemSize : " + ((OpenCLDevice) bestGPU).getGlobalMemSize()); - System.out.println(" LocalMemSize : " + ((OpenCLDevice) bestGPU).getLocalMemSize()); - System.out.println(" MaxComputeUnits : " + ((OpenCLDevice) bestGPU).getMaxComputeUnits()); - System.out.println(" MaxWorkGroupSizes : " + ((OpenCLDevice) bestGPU).getMaxWorkGroupSize()); - System.out.println(" MaxWorkItemDimensions : " + ((OpenCLDevice) bestGPU).getMaxWorkItemDimensions()); - System.out.println("}"); - } - - Device firstACC = OpenCLDevice.firstACC(); - if (firstACC == null) { - System.out.println("OpenCLDevice.firstACC() returned null!"); - } else { - System.out.println("OpenCLDevice.firstACC() returned { "); - System.out.println(" Type : " + firstACC.getType()); - System.out.println(" GlobalMemSize 
: " + ((OpenCLDevice) firstACC).getGlobalMemSize()); - System.out.println(" LocalMemSize : " + ((OpenCLDevice) firstACC).getLocalMemSize()); - System.out.println(" MaxComputeUnits : " + ((OpenCLDevice) firstACC).getMaxComputeUnits()); - System.out.println(" MaxWorkGroupSizes : " + ((OpenCLDevice) firstACC).getMaxWorkGroupSize()); - System.out.println(" MaxWorkItemDimensions : " + ((OpenCLDevice) firstACC).getMaxWorkItemDimensions()); - System.out.println("}"); + for (Device device : preferences.getPreferredDevices(null)) { + System.out.println(device); + System.out.println(); } - - Device bestACC = OpenCLDevice.bestACC(); - if (bestACC == null) { - System.out.println("OpenCLDevice.bestACC() returned null!"); - } else { - System.out.println("OpenCLDevice.bestACC() returned { "); - System.out.println(" Type : " + bestACC.getType()); - System.out.println(" GlobalMemSize : " + ((OpenCLDevice) bestACC).getGlobalMemSize()); - System.out.println(" LocalMemSize : " + ((OpenCLDevice) bestACC).getLocalMemSize()); - System.out.println(" MaxComputeUnits : " + ((OpenCLDevice) bestACC).getMaxComputeUnits()); - System.out.println(" MaxWorkGroupSizes : " + ((OpenCLDevice) bestACC).getMaxWorkGroupSize()); - System.out.println(" MaxWorkItemDimensions : " + ((OpenCLDevice) bestACC).getMaxWorkItemDimensions()); - System.out.println("}"); - } - } } diff --git a/samples/life/src/com/amd/aparapi/sample/life/Main.java b/samples/life/src/com/amd/aparapi/sample/life/Main.java index 963cceb9ed0750585f0891c483d1bead7c3b4dd8..e51ca5fafa4431d417c07206fff95782c4d168c0 100644 --- a/samples/life/src/com/amd/aparapi/sample/life/Main.java +++ b/samples/life/src/com/amd/aparapi/sample/life/Main.java @@ -38,26 +38,14 @@ under those regulations, please refer to the U.S. 
Bureau of Industry and Securit package com.amd.aparapi.sample.life; -import java.awt.BorderLayout; -import java.awt.Dimension; -import java.awt.FlowLayout; -import java.awt.Graphics; -import java.awt.event.ActionEvent; -import java.awt.event.ActionListener; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferInt; -import java.util.List; - -import javax.swing.JButton; -import javax.swing.JComponent; -import javax.swing.JFrame; -import javax.swing.JLabel; -import javax.swing.JPanel; -import javax.swing.WindowConstants; - import com.amd.aparapi.Kernel; -import com.amd.aparapi.ProfileInfo; -import com.amd.aparapi.Range; +import com.amd.aparapi.*; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import java.awt.image.*; +import java.util.List; /** * An example Aparapi application which demonstrates Conways 'Game Of Life'. @@ -239,7 +227,7 @@ public class Main{ } }); controlPanel.add(startButton); - controlPanel.add(new JLabel(lifeKernel.getExecutionMode().toString())); + controlPanel.add(new JLabel(lifeKernel.getTargetDevice().getShortDescription())); controlPanel.add(new JLabel(" Generations/Second=")); final JLabel generationsPerSecond = new JLabel("0.00"); diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java index 8ad4a7a766537877946a11e6ce71e2038431c2be..13de958505466f8a17ce3af2cbe84f3481d130f8 100644 --- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java +++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java @@ -38,24 +38,14 @@ under those regulations, please refer to the U.S. 
Bureau of Industry and Securit package com.amd.aparapi.sample.mandel; -import java.awt.Color; -import java.awt.Dimension; -import java.awt.Graphics; -import java.awt.Point; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.WindowAdapter; -import java.awt.event.WindowEvent; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferInt; -import java.util.List; - -import javax.swing.JComponent; -import javax.swing.JFrame; - import com.amd.aparapi.Kernel; -import com.amd.aparapi.ProfileInfo; -import com.amd.aparapi.Range; +import com.amd.aparapi.*; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import java.awt.image.*; +import java.util.List; /** * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. @@ -107,7 +97,6 @@ public class Main{ * @param _width Mandelbrot image width * @param _height Mandelbrot image height * @param _rgb Mandelbrot image RGB buffer - * @param _pallette Mandelbrot image palette */ public MandelKernel(int _width, int _height, int[] _rgb) { //Initialize palette values @@ -229,8 +218,7 @@ public class Main{ System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); viewer.repaint(); - // Report target execution mode: GPU or JTP (Java Thread Pool). - System.out.println("Execution mode=" + kernel.getExecutionMode()); + System.out.println("device=" + kernel.getTargetDevice()); // Window listener to dispose Kernel resources on user exit. 
frame.addWindowListener(new WindowAdapter(){ diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java index 65c62320965962f53fcb5aa98bae0254a6aca5ca..8a1b7faa68eceb14aeae40c133bf2d6f57303bd0 100644 --- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java +++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java @@ -38,24 +38,14 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.sample.mandel; -import java.awt.Color; -import java.awt.Dimension; -import java.awt.Graphics; -import java.awt.Point; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.WindowAdapter; -import java.awt.event.WindowEvent; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferInt; -import java.util.List; - -import javax.swing.JComponent; -import javax.swing.JFrame; - import com.amd.aparapi.Kernel; -import com.amd.aparapi.ProfileInfo; -import com.amd.aparapi.Range; +import com.amd.aparapi.*; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import java.awt.image.*; +import java.util.List; /** * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. @@ -97,11 +87,8 @@ public class Main2D{ /** * Initialize the Kernel. - * - * @param _width Mandelbrot image width - * @param _height Mandelbrot image height + * * @param _rgb Mandelbrot image RGB buffer - * @param _pallette Mandelbrot image palette */ public MandelKernel(int[] _rgb) { rgb = _rgb; @@ -209,8 +196,7 @@ public class Main2D{ System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); viewer.repaint(); - // Report target execution mode: GPU or JTP (Java Thread Pool). 
- System.out.println("Execution mode=" + kernel.getExecutionMode()); + System.out.println("device=" + kernel.getTargetDevice()); // Window listener to dispose Kernel resources on user exit. frame.addWindowListener(new WindowAdapter(){ diff --git a/samples/mdarray/build.xml b/samples/mdarray/build.xml index 7c5bf8ec6b3363b236090b2fd542ad50cd23766f..787fd0950e69d37599e2ba0b855c16714fa19710 100644 --- a/samples/mdarray/build.xml +++ b/samples/mdarray/build.xml @@ -19,7 +19,7 @@ <target name="build" depends="clean"> <mkdir dir="classes" /> - <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" fork="true" memorymaximumsize="3G"> + <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" fork="true" memorymaximumsize="1024m"> <classpath> <pathelement path="../../com.amd.aparapi/dist/aparapi.jar" /> </classpath> diff --git a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java index 99fa6259ff541663b292bc0e0c29aaf6709d61c3..f4e3e28c5d7e748613d067ba6e76dbe018429b6e 100644 --- a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java +++ b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java @@ -1,33 +1,52 @@ package com.amd.aparapi.sample.median; -import com.amd.aparapi.Kernel; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; import javax.imageio.*; import javax.swing.*; import java.awt.*; import java.awt.image.*; import java.io.*; +import java.util.*; /** * Demonstrate use of __private namespaces and @NoCL annotations. 
*/ public class MedianDemo { - public final static BufferedImage testImage; + public static BufferedImage testImage; static { try { - File imageFile = new File("./../../../samples/convolution/testcard.jpg").getCanonicalFile(); - testImage = ImageIO.read(imageFile); + File imageFile = new File("./samples/convolution/testcard.jpg").getCanonicalFile(); + if (imageFile.exists()) { + testImage = ImageIO.read(imageFile); + } } catch (IOException e) { throw new RuntimeException(e); } } - private static final boolean TEST_JTP = false; + private static final boolean TEST_JTP = true; public static void main(String[] ignored) { final int size = 5; System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "true"); + boolean verbose = true; + if (verbose) + { + System.setProperty("com.amd.aparapi.enableVerboseJNI", "true"); + System.setProperty("com.amd.aparapi.dumpFlags", "true"); + System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "true"); + System.setProperty("com.amd.aparapi.enableVerboseJNIOpenCLResourceTracking", "true"); + System.setProperty("com.amd.aparapi.enableExecutionModeReporting", "true"); + } + + if (TEST_JTP) { + LinkedHashSet<Device> devices = new LinkedHashSet<>(Collections.singleton(JavaDevice.THREAD_POOL)); + KernelManager.instance().setDefaultPreferredDevices(devices); + } + int[] argbs = testImage.getRGB(0, 0, testImage.getWidth(), testImage.getHeight(), null, 0, testImage.getWidth()); MedianKernel7x7 kernel = new MedianKernel7x7(); kernel._imageTypeOrdinal = MedianKernel7x7.RGB; @@ -35,9 +54,7 @@ public class MedianDemo { kernel._sourceHeight = testImage.getHeight(); kernel._sourcePixels = argbs; kernel._destPixels = new int[argbs.length]; - if (TEST_JTP) { - kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP); - } + kernel.processImages(new MedianSettings(size)); BufferedImage out = new BufferedImage(testImage.getWidth(), testImage.getHeight(), BufferedImage.TYPE_INT_RGB); out.setRGB(0, 0, testImage.getWidth(), testImage.getHeight(), 
kernel._destPixels, 0, testImage.getWidth()); diff --git a/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java b/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java index 6cbece4157e3518a897c4e1bfead8fdf2ba7dbbd..c393720be7b4b200645d039ec0b28425f8d86e5b 100644 --- a/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java +++ b/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java @@ -28,7 +28,8 @@ public class MedianKernel7x7 extends Kernel { protected int[] _destPixels; // NB could also use suffix naming instead of annotation ... field would be named _window_$private$49 - @PrivateMemorySpace(MAX_WINDOW_SIZE) private short[] _window = new short[MAX_WINDOW_SIZE]; + @PrivateMemorySpace(MAX_WINDOW_SIZE) + private short[] _window = new short[MAX_WINDOW_SIZE]; @NoCL private static ThreadLocal<short[]> _threadLocalWindow = new ThreadLocal<short[]>() { @Override protected short[] initialValue() { diff --git a/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java b/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java index 7cc2584b1cb10d054f16632dd12ff27f2102c53b..7bfc91e4eea39ce1148611cfa428cdc6879a90bf 100644 --- a/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java +++ b/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java @@ -1,7 +1,7 @@ package com.amd.aparapi.sample.progress; -import com.amd.aparapi.Kernel; -import com.amd.aparapi.internal.kernel.KernelRunner; +import com.amd.aparapi.*; +import com.amd.aparapi.internal.kernel.*; import com.amd.aparapi.util.swing.MultiPassKernelSwingWorker; import javax.swing.*; @@ -23,13 +23,13 @@ public class MultiPassKernelSwingWorkerDemo { private static LongRunningKernel kernel; private static MultiPassKernelSwingWorker worker; - private static final boolean TEST_JTP = true; + private static final boolean TEST_JTP = false; 
public static void main(String[] ignored) throws Exception { - kernel = new LongRunningKernel(); if (TEST_JTP) { - kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP); + KernelManager.setKernelManager(KernelManagers.JTP_ONLY); } + kernel = new LongRunningKernel(); UIManager.setLookAndFeel(NimbusLookAndFeel.class.getName()); JPanel rootPanel = new JPanel(); diff --git a/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java b/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java index b114dcac4f19b5d93e6ec82b1d84da19193fa719..721f2c611ee06bf1fd3a144aedc16262785d84b5 100644 --- a/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java +++ b/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java @@ -1,13 +1,11 @@ package com.amd.aparapi.sample.progress; -import com.amd.aparapi.Kernel; -import com.amd.aparapi.internal.kernel.KernelRunner; +import com.amd.aparapi.internal.kernel.*; import javax.swing.*; -import javax.swing.plaf.nimbus.NimbusLookAndFeel; +import javax.swing.plaf.nimbus.*; import java.awt.*; -import java.awt.event.ActionEvent; -import java.awt.event.ActionListener; +import java.awt.event.*; /** * Demonstrates progress tracking and cancellation for multi-pass kernels. 
@@ -36,7 +34,7 @@ public class ProgressAndCancelDemo { kernel = new LongRunningKernel(); if (TEST_JTP) { - kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP); + KernelManager.setKernelManager(KernelManagers.JTP_ONLY); } Thread asynchReader = new Thread() { @Override diff --git a/samples/squares/src/com/amd/aparapi/sample/squares/Main.java b/samples/squares/src/com/amd/aparapi/sample/squares/Main.java index 32a1b70b8bfd16cd76eff8d5666442738a18dc72..247cda6f8ea46601b339efe22b8144dc94b88b8b 100644 --- a/samples/squares/src/com/amd/aparapi/sample/squares/Main.java +++ b/samples/squares/src/com/amd/aparapi/sample/squares/Main.java @@ -82,7 +82,7 @@ public class Main{ kernel.execute(Range.create(512)); // Report target execution mode: GPU or JTP (Java Thread Pool). - System.out.println("Execution mode=" + kernel.getExecutionMode()); + System.out.println("Device = " + kernel.getTargetDevice().getShortDescription()); // Display computed square values. for (int i = 0; i < size; i++) { diff --git a/test/codegen/src/java/com/amd/aparapi/Source.java b/test/codegen/src/java/com/amd/aparapi/Source.java index a08c2872186874a4d7aef3c387116130718b3770..d9774096ed5499de4435e7da3e095b0c00d53bde 100644 --- a/test/codegen/src/java/com/amd/aparapi/Source.java +++ b/test/codegen/src/java/com/amd/aparapi/Source.java @@ -84,7 +84,7 @@ public class Source{ public Source(Class<?> _clazz, File _rootDir) { clazz = _clazz; - String srcName = clazz.getPackage().getName().replace(".", "/") + "/" + clazz.getSimpleName() + ".java"; + String srcName = clazz.getPackage().getName().replace(".", "/") + "/" + clazz + ".java"; file = new File(_rootDir, srcName); try { BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java index 847a69575859dce0ab6d894f34cd9812270a7f36..1f9a36fa893ca1d383606c82cf52538f11eec61c 
100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java @@ -1,17 +1,13 @@ package com.amd.aparapi.test.runtime; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import org.junit.*; -import java.util.Arrays; +import java.util.*; -import org.junit.BeforeClass; -import org.junit.Test; - -import com.amd.aparapi.Kernel; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; +import static org.junit.Assert.*; public class BufferTransfer{ @@ -19,7 +15,7 @@ public class BufferTransfer{ @BeforeClass public static void setUpBeforeClass() throws Exception { - Device device = Device.best(); + Device device = KernelManager.instance().bestDevice(); if (device == null || !(device instanceof OpenCLDevice)) { fail("no opencl device!"); } @@ -209,7 +205,7 @@ public class BufferTransfer{ for (int n = 0; n < neuronOutputs.length; n++) log[n][simStep[0]] = neuronOutputs[n]; } - System.out.println(getExecutionMode() + (isExplicit() ? ", explicit" : ", auto")); + System.out.println(getTargetDevice().getShortDescription() + (isExplicit() ? 
", explicit" : ", auto")); for (int n = 0; n < neuronOutputs.length; n++) System.out.println(Arrays.toString(log[n])); diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java index ca70f1a2ec6c39b6da1114a1cd1f39262c5af4f0..8cfb0d251027af33dff7c4b884055a94c7a03adb 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java @@ -1,8 +1,10 @@ package com.amd.aparapi.test.runtime; -import static org.junit.Assert.assertTrue; -import org.junit.Test; -import com.amd.aparapi.Kernel; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import org.junit.*; + +import static org.junit.Assert.*; class AnotherClass{ static public int foo(int i) { @@ -42,7 +44,7 @@ public class CallStaticFromAnonymousKernel{ } }; kernel.execute(size); - assertTrue("ran on GPU", kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU); + assertTrue("ran on GPU", kernel.getTargetDevice().getType() == Device.TYPE.GPU); for (int i = 0; i < size; i++) { assertTrue("results == fooBar", results[i] == (fooBar(values[i]) + AnotherClass.foo(i))); diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java index c80b587ba5fff03255756764f1256c7eaab0a44a..c59efbd9f90b1fce79b1de38202f4084b8a0ed5f 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java @@ -1,10 +1,9 @@ package com.amd.aparapi.test.runtime; -import static org.junit.Assert.assertTrue; +import com.amd.aparapi.*; +import org.junit.*; -import org.junit.Test; - -import com.amd.aparapi.Kernel; +import static org.junit.Assert.*; public class ExplicitBoolean{ @@ -61,7 +60,7 @@ public 
class ExplicitBoolean{ printArray(k2.output); assertTrue("k1.input == k2.input", Util.same(k1.output, k1.output)); - System.out.println(k1.getExecutionMode()); + System.out.println(k1.getTargetDevice().getShortDescription()); } private static void printArray(boolean[] a) { diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java index 28b2a73b50b76eba3ecee1751eaecdefaaaacf22..99d1764c9857952a461135f27ed89ab46cd12ba2 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java @@ -1,14 +1,13 @@ package com.amd.aparapi.test.runtime; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; -import com.amd.aparapi.opencl.OpenCL.Source; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; +import org.junit.*; -import static org.junit.Assert.assertTrue; -import org.junit.Test; +import static org.junit.Assert.*; public class LoadCL{ @@ -31,7 +30,7 @@ public class LoadCL{ final float[] quads = new float[size]; final Range range = Range.create(size); - final Device device = Device.best(); + final Device device = KernelManager.instance().bestDevice(); if (device instanceof OpenCLDevice) { final OpenCLDevice openclDevice = (OpenCLDevice) device; diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java index 51cee4366bdc15dc648cfb647976f9eb7cf423b0..b415b7764a36e6cbb210a3df724b8752802998a2 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java @@ -1,12 
+1,14 @@ package com.amd.aparapi.test.runtime; +import com.amd.aparapi.device.*; import org.junit.Test; import com.amd.aparapi.Kernel; import com.amd.aparapi.Range; public class Test12x4_4x2{ - @Test public void test() { + @SuppressWarnings("deprecation") + @Test public void test() { // globalThreadId, threadId, globalX, globalY, localX, localY final int[][] test = new int[][] { { @@ -446,7 +448,12 @@ public class Test12x4_4x2{ }; Kernel kernel = new Kernel(){ - @Override public void run() { + @Override + public boolean isAllowDevice(Device _device) { + return _device.getType() == Device.TYPE.JTP; + } + + @Override public void run() { int x = getGlobalId(0); int y = getGlobalId(1); int lx = getLocalId(0); @@ -492,7 +499,6 @@ public class Test12x4_4x2{ } }; - kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP); kernel.execute(Range.create2D(12, 4, 4, 2)); } diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java index 41f4b0d21e02207f9ad02621c06f2776a67bf5fd..5ce32645e4c77beb6776101754dc2918f5c0d743 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java @@ -1,8 +1,10 @@ package com.amd.aparapi.test.runtime; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import org.junit.*; + import static org.junit.Assert.*; -import org.junit.Test; -import com.amd.aparapi.Kernel; public class UseStaticArray extends Kernel{ @@ -26,7 +28,7 @@ public class UseStaticArray extends Kernel{ execute(size); - assertTrue("ran on GPU", getExecutionMode() == Kernel.EXECUTION_MODE.GPU); + assertTrue("ran on GPU", getTargetDevice().getType() == Device.TYPE.GPU); assertArrayEquals("results == fooBar", results, values); // for (int i = 0; i < size; i++) {