From 2fb33514e9603e1e9460b3aac7bf4e0890a071b5 Mon Sep 17 00:00:00 2001 From: Barney <barney@frontofficedeveloper.com> Date: Mon, 28 Sep 2015 11:56:56 +0100 Subject: [PATCH] Improvements to profiling apparatus Added JNI-side caching of compiled binaries per kernel per device Added java-side support for control of binary caching Added facility to cleanUpArrays() in a kernel, frees (most of) GPU memory associated with a kernel but retains JINContext to allow kernel reuse --- com.amd.aparapi.jni/src/cpp/CLHelper.cpp | 64 ++++++++- com.amd.aparapi.jni/src/cpp/CLHelper.h | 2 +- .../src/cpp/invoke/OpenCLJNI.cpp | 4 +- .../src/cpp/runKernel/Aparapi.cpp | 6 +- .../src/java/com/amd/aparapi/Config.java | 8 ++ .../src/java/com/amd/aparapi/Kernel.java | 35 +++++ .../src/java/com/amd/aparapi/Range.java | 5 + .../aparapi/internal/jni/KernelArgJNI.java | 45 +++---- .../aparapi/internal/jni/KernelRunnerJNI.java | 10 +- .../internal/kernel/KernelDeviceProfile.java | 24 ++-- .../internal/kernel/KernelManager.java | 13 +- .../internal/kernel/KernelProfile.java | 21 +-- .../aparapi/internal/kernel/KernelRunner.java | 126 +++++++++++++----- .../internal/kernel/ProfilingEvent.java | 2 +- .../aparapi/internal/model/ClassModel.java | 42 +++--- .../amd/aparapi/internal/model/Memoizer.java | 6 +- .../aparapi/internal/model/ValueCache.java | 8 +- .../amd/aparapi/sample/blackscholes/Main.java | 12 +- .../configuration/AutoCleanUpArraysDemo.java | 20 +++ .../configuration/CleanUpArraysDemo.java | 25 ++++ .../configuration/ConfigurationDemo.java | 6 +- .../sample/configuration/ProfilingDemo.java | 83 ++++++++++++ .../ProfilingDemoNoBinaryCaching.java | 14 ++ .../sample/convolution/Convolution.java | 8 +- .../com/amd/aparapi/sample/mandel/Main.java | 25 +++- .../com/amd/aparapi/sample/mandel/Main2D.java | 1 + .../amd/aparapi/sample/median/MedianDemo.java | 57 +++++--- 27 files changed, 520 insertions(+), 152 deletions(-) create mode 100644 
samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java create mode 100644 samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java create mode 100644 samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java create mode 100644 samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java diff --git a/com.amd.aparapi.jni/src/cpp/CLHelper.cpp b/com.amd.aparapi.jni/src/cpp/CLHelper.cpp index 1d0752e7..acf91311 100644 --- a/com.amd.aparapi.jni/src/cpp/CLHelper.cpp +++ b/com.amd.aparapi.jni/src/cpp/CLHelper.cpp @@ -40,6 +40,8 @@ #include "CLHelper.h" #include "List.h" #include <map> +#include <vector> +#include <stdio.h> void setMap(std::map<cl_int, const char*>& errorMap) { errorMap[CL_SUCCESS] = "success"; @@ -129,14 +131,62 @@ void CLHelper::getBuildErr(JNIEnv *jenv, cl_device_id deviceId, cl_program prog delete []buildLog; } -cl_program CLHelper::compile(JNIEnv *jenv, cl_context context, size_t deviceCount, cl_device_id* deviceIds, jstring source, jstring* log, cl_int* status){ - const char *sourceChars = jenv->GetStringUTFChars(source, NULL); - size_t sourceSize[] = { strlen(sourceChars) }; - cl_program program = clCreateProgramWithSource(context, 1, &sourceChars, sourceSize, status); - jenv->ReleaseStringUTFChars(source, sourceChars); - *status = clBuildProgram(program, deviceCount, deviceIds, NULL, NULL, NULL); +cl_program CLHelper::compile(JNIEnv *jenv, cl_context context, cl_device_id* deviceId, jstring* source, jstring* binaryKey, jstring* log, cl_int* status){ + using std::map; + using std::vector; + using std::string; + + static map<string, vector<unsigned char *> > src2bin; + static map<string, vector<size_t> > src2len; + + const char* sourceChars = jenv->GetStringUTFChars(*source, NULL); + const char* keyChars = jenv->GetStringUTFChars(*binaryKey, NULL); + string sourceStr(sourceChars); + string keyStr(keyChars); + + size_t 
sourceLength[] = {sourceStr.length()}; + + bool cacheDisabled = jenv->GetStringLength(*binaryKey) == 0; + + cl_program program; + bool is_built_from_source = false; + bool keyNotFound = src2bin.find(keyStr) == src2bin.end(); + + if (cacheDisabled || keyNotFound) { + is_built_from_source = true; + program = clCreateProgramWithSource(context, 1, &sourceChars, sourceLength, status); + } + else{ + cl_int *binary_status = new cl_int[1]; + program = clCreateProgramWithBinary(context, 1, deviceId, &src2len[keyStr][0], (const unsigned char**)&src2bin[keyStr][0], binary_status, NULL); + cl_int theStatus = binary_status[0]; + if (theStatus != CL_SUCCESS) { + getBuildErr(jenv, *deviceId, program, log); + } + delete[] binary_status; + } + + jenv->ReleaseStringUTFChars(*source, sourceChars); + jenv->ReleaseStringUTFChars(*binaryKey, keyChars); + + *status = clBuildProgram(program, 1, deviceId, NULL, NULL, NULL); if(*status == CL_BUILD_PROGRAM_FAILURE) { - getBuildErr(jenv, *deviceIds, program, log); + getBuildErr(jenv, *deviceId, program, log); + } + + if(is_built_from_source && !cacheDisabled) { + vector<unsigned char *> &bins = src2bin[keyStr]; + vector<size_t> &lens = src2len[keyStr]; + + bins.resize(1); + lens.resize(1); + + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &lens[0], NULL); + for(size_t i = 0; i < 1; ++i){ + bins[i] = new unsigned char[lens[i]]; + } + + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &bins[0], NULL); } return(program); } diff --git a/com.amd.aparapi.jni/src/cpp/CLHelper.h b/com.amd.aparapi.jni/src/cpp/CLHelper.h index d1581e9e..71f761ef 100644 --- a/com.amd.aparapi.jni/src/cpp/CLHelper.h +++ b/com.amd.aparapi.jni/src/cpp/CLHelper.h @@ -45,7 +45,7 @@ class CLHelper{ public: static const char *errString(cl_int status); static void getBuildErr(JNIEnv *jenv, cl_device_id deviceId, cl_program program, jstring *log); - static cl_program compile(JNIEnv *jenv, cl_context context, size_t deviceCount, 
cl_device_id* deviceId, jstring source, jstring* log, cl_int *status); + static cl_program compile(JNIEnv *jenv, cl_context context, cl_device_id* deviceId, jstring* source, jstring* binaryKey, jstring* log, cl_int *status); static jstring getExtensions(JNIEnv *jenv, cl_device_id deviceId, cl_int *status); }; diff --git a/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp b/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp index ccfa62bf..b637a390 100644 --- a/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp +++ b/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp @@ -88,7 +88,7 @@ void OpenCLRange::fill(JNIEnv *jenv, jobject rangeInstance, jint dims, size_t* o } JNI_JAVA(jobject, OpenCLJNI, createProgram) - (JNIEnv *jenv, jobject jobj, jobject deviceInstance, jstring source) { + (JNIEnv *jenv, jobject jobj, jobject deviceInstance, jstring source, jstring binaryKey) { jobject platformInstance = OpenCLDevice::getPlatformInstance(jenv, deviceInstance); cl_platform_id platformId = OpenCLPlatform::getPlatformId(jenv, platformInstance); @@ -105,7 +105,7 @@ JNI_JAVA(jobject, OpenCLJNI, createProgram) jstring log=NULL; - cl_program program = CLHelper::compile(jenv, context, 1, &deviceId, source, &log, &status); + cl_program program = CLHelper::compile(jenv, context, &deviceId, &source, &binaryKey, &log, &status); cl_command_queue queue = NULL; if(status == CL_SUCCESS) { cl_command_queue_properties queue_props = CL_QUEUE_PROFILING_ENABLE; diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp index cbad7e81..eb404c52 100644 --- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp +++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp @@ -52,7 +52,7 @@ static const int PASS_ID_PREPARING_EXECUTION = -2; static const int PASS_ID_COMPLETED_EXECUTION = -1; -static const int CANCEL_STATUS_FALSE = 0; +static const int CANCEL_STATUS_FALSE = 0; static const int CANCEL_STATUS_TRUE = 1; //compiler dependant code @@ -1198,7 
+1198,7 @@ void writeProfile(JNIEnv* jenv, JNIContext* jniContext) { } JNI_JAVA(jlong, KernelRunnerJNI, buildProgramJNI) - (JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) { + (JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source, jstring binaryKey) { JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); if (jniContext == NULL){ return 0; @@ -1207,7 +1207,7 @@ JNI_JAVA(jlong, KernelRunnerJNI, buildProgramJNI) try { cl_int status = CL_SUCCESS; - jniContext->program = CLHelper::compile(jenv, jniContext->context, 1, &jniContext->deviceId, source, NULL, &status); + jniContext->program = CLHelper::compile(jenv, jniContext->context, &jniContext->deviceId, &source, &binaryKey, NULL, &status); if(status == CL_BUILD_PROGRAM_FAILURE) throw CLException(status, ""); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java index fbae39bb..6f08fbd8 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java @@ -106,6 +106,14 @@ public class Config extends ConfigJNI{ */ public static final boolean dumpProfilesOnExit = Boolean.getBoolean(propPkgName + ".dumpProfilesOnExit"); + /** + * Dumps profiling info (for a single execution) after every Kernel execution. 
+ * + * Usage -Dcom.amd.aparapi.dumpProfileOnExecution={true|false} + * + */ + public static final boolean dumpProfileOnExecution = Boolean.getBoolean(propPkgName + ".dumpProfileOnExecution"); + // Pragma/OpenCL codegen related flags public static final boolean enableAtomic32 = Boolean.getBoolean(propPkgName + ".enableAtomic32"); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java index 8bead23f..df9f7c46 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java @@ -464,6 +464,8 @@ public abstract class Kernel implements Cloneable { private KernelRunner kernelRunner = null; + private boolean autoCleanUpArrays = false; + private KernelState kernelState = new KernelState(); /** @@ -2110,6 +2112,33 @@ public abstract class Kernel implements Cloneable { return prepareKernelRunner().execute(_entrypoint, _range, _passes); } + public boolean isAutoCleanUpArrays() { + return autoCleanUpArrays; + } + + /** + * Property which if true enables automatic calling of {@link #cleanUpArrays()} following each execution. + */ + public void setAutoCleanUpArrays(boolean autoCleanUpArrays) { + this.autoCleanUpArrays = autoCleanUpArrays; + } + + /** + * Frees the bulk of the resources used by this kernel, by setting array sizes in non-primitive {@link KernelArg}s to 1 (0 size is prohibited) and invoking kernel + * execution on a zero size range. Unlike {@link #dispose()}, this does not prohibit further invocations of this kernel, as sundry resources such as OpenCL queues are + * <b>not</b> freed by this method. 
+ * + * <p>This allows a "dormant" Kernel to remain in existence without undue strain on GPU resources, which may be strongly preferable to disposing a Kernel and + * recreating another one later, as creation/use of a new Kernel (specifically creation of its associated OpenCL context) is expensive.</p> + * + * <p>Note that where the underlying array field is declared final, it cannot be reassigned and so is left at its current size.</p> + */ + public synchronized void cleanUpArrays() { + if (kernelRunner != null) { + kernelRunner.cleanUpArrays(); + } + } + /** * Release any resources associated with this Kernel. * <p> @@ -2125,6 +2154,12 @@ public abstract class Kernel implements Cloneable { } } + /** Automatically releases any resources associated with this Kernel when the Kernel is garbage collected. */ + @Override + protected void finalize() { + dispose(); + } + public boolean isRunningCL() { return getTargetDevice() instanceof OpenCLDevice; } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java index 75db2c24..3d6aef9a 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java @@ -140,6 +140,11 @@ public class Range extends RangeJNI{ public static Range create(Device _device, int _globalWidth) { final Range withoutLocal = create(_device, _globalWidth, 1); + if (_globalWidth == 0) { + withoutLocal.setLocalIsDerived(true); + return withoutLocal; + } + if (withoutLocal.isValid()) { withoutLocal.setLocalIsDerived(true); final int[] factors = getFactors(withoutLocal.getGlobalSize_0(), withoutLocal.getMaxWorkItemSize()[0]); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java index 270321ff..895d1ff9 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java +++ 
b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java @@ -1,8 +1,8 @@ package com.amd.aparapi.internal.jni; -import java.lang.reflect.Field; +import com.amd.aparapi.internal.annotation.*; -import com.amd.aparapi.internal.annotation.UsedByJNICode; +import java.lang.reflect.*; /** * This class is intended to be used as a 'proxy' or 'facade' object for Java code to interact with JNI @@ -12,28 +12,25 @@ public abstract class KernelArgJNI{ /** * The type of this KernelArg. Created by or-ing appropriate flags * - * @see ARG_BOOLEAN - * @see ARG_BYTE - * @see ARG_CHAR - * @see ARG_FLOAT - * @see ARG_INT - * @see ARG_DOUBLE - * @see ARG_LONG - * @see ARG_SHORT - * @see ARG_ARRAY - * @see ARG_PRIMITIVE - * @see ARG_READ - * @see ARG_WRITE - * @see ARG_LOCAL - * @see ARG_GLOBAL - * @see ARG_CONSTANT - * @see ARG_ARRAYLENGTH - * @see ARG_APARAPI_BUF - * @see ARG_EXPLICIT - * @see ARG_EXPLICIT_WRITE - * @see ARG_OBJ_ARRAY_STRUCT - * @see ARG_APARAPI_BUF_HAS_ARRAY - * @see ARG_APARAPI_BUF_IS_DIRECT + * @see KernelRunnerJNI#ARG_BOOLEAN + * @see KernelRunnerJNI#ARG_BYTE + * @see KernelRunnerJNI#ARG_CHAR + * @see KernelRunnerJNI#ARG_FLOAT + * @see KernelRunnerJNI#ARG_INT + * @see KernelRunnerJNI#ARG_DOUBLE + * @see KernelRunnerJNI#ARG_LONG + * @see KernelRunnerJNI#ARG_SHORT + * @see KernelRunnerJNI#ARG_ARRAY + * @see KernelRunnerJNI#ARG_PRIMITIVE + * @see KernelRunnerJNI#ARG_READ + * @see KernelRunnerJNI#ARG_WRITE + * @see KernelRunnerJNI#ARG_LOCAL + * @see KernelRunnerJNI#ARG_GLOBAL + * @see KernelRunnerJNI#ARG_CONSTANT + * @see KernelRunnerJNI#ARG_ARRAYLENGTH + * @see KernelRunnerJNI#ARG_EXPLICIT + * @see KernelRunnerJNI#ARG_EXPLICIT_WRITE + * @see KernelRunnerJNI#ARG_OBJ_ARRAY_STRUCT */ @UsedByJNICode protected int type; diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java index 923875ee..7b83bb9b 100644 --- 
a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java @@ -307,7 +307,15 @@ public abstract class KernelRunnerJNI{ protected native int getJNI(long _jniContextHandle, Object _array); - protected native long buildProgramJNI(long _jniContextHandle, String _source); + /** + * @param _source The OpenCL source code to compile, which may be sent empty if the binary for that source code is known to be cached on the JNI side + * under the key {@code _binaryKey}. + * @param _binaryKey A key which embodies a Kernel class and a Device, under which the JNI side will cache the compiled binary corresponding to that Kernel/Device + * pair. Once a certain _binaryKey has been passed to this method once, further calls to this method with that key will ignore the _source (which + * can be passed empty) and use the cached binary. + * <p>By passing an empty String as the _binaryKey, the entire JNI-side binary caching apparatus can be disabled. + */ + protected native long buildProgramJNI(long _jniContextHandle, String _source, String _binaryKey); protected native int setArgsJNI(long _jniContextHandle, KernelArgJNI[] _args, int argc); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java index 577c13d9..55c4ee50 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java @@ -17,6 +17,7 @@ public class KernelDeviceProfile { private static final int TABLE_COLUMN_HEADER_WIDTH = 21; private static final int TABLE_COLUMN_COUNT_WIDTH = 8; private static final int TABLE_COLUMN_WIDTH; + private static String tableHeader = null; private final Class<? 
extends Kernel> kernel; private final Device device; private long[] currentTimes = new long[ProfilingEvent.values().length]; @@ -106,17 +107,20 @@ public class KernelDeviceProfile { return sum; } - public static String getTableHeader() { - int length = ProfilingEvent.values().length; - StringBuilder builder = new StringBuilder(150); - appendRowHeaders(builder, "Device", "Count"); - for (int i = 1; i < length; ++i) { - ProfilingEvent stage = ProfilingEvent.values()[i]; - String heading = stage.name(); - appendCell(builder, heading); + public static synchronized String getTableHeader() { + if (tableHeader == null) { + int length = ProfilingEvent.values().length; + StringBuilder builder = new StringBuilder(150); + appendRowHeaders(builder, "Device", "Count"); + for (int i = 1; i < length; ++i) { + ProfilingEvent stage = ProfilingEvent.values()[i]; + String heading = stage.name(); + appendCell(builder, heading); + } + builder.append(" ").append("Total"); + tableHeader = builder.toString(); } - builder.append(" ").append("Total"); - return builder.toString(); + return tableHeader; } public String getLastAsTableRow() { diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java index 3ced0ae0..2b5dc2e2 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java @@ -47,11 +47,16 @@ public class KernelManager { /** This method returns a shared instance of a given Kernel subclass. The kernelClass needs a no-args constructor, which * need not be public. * - * <p>Given that compilation of OpenCL is relatively expensive and that (currently!) 
there is no caching of compiled OpenCL - it is desirable to only ever create one instance of any given Kernel subclass, which this method facilitates.</p> + * <p>Each new Kernel instance requires a new JNIContext, the creation of which is expensive. There is apparently no simple solution by which a cached JNIContext can be reused + * for all instances of a given Kernel class, since it is intimately connected with resource acquisition and release. In the absence of a context caching solution, it is often + * highly desirable to only ever create one instance of any given Kernel subclass, which this method facilitates.</p> * - * <p>In order to maintain thread saftey, it is necessary to synchronize on the returned kernel for the duration of the process of setting up, - * executing and extracting the results from that kernel, when using a shared instance.</p> + * <p>In order to maintain thread safety when using a shared instance, it is necessary to synchronize on the returned kernel for the duration of the process of setting up, + * executing and extracting the results from that kernel.</p> + * + * <p>This method instantiates a Kernel (per Kernel class) via Reflection, and thus can only be used where the Kernel class has a no-args constructor, which need not be public. + * In fact, if a Kernel subclass is designed to be used in conjunction with this method, it is recommended that its <b>only</b> constructor is a <b>private</b> no-args constructor. 
+ * </p> * * @throws RuntimeException if the class cannot be instantiated */ diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java index 647ab513..3d1caaa1 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java @@ -12,6 +12,7 @@ import java.util.logging.*; */ public class KernelProfile { + private static final double MILLION = 1000000d; private static Logger logger = Logger.getLogger(Config.getLoggerName()); private final Class<? extends Kernel> kernelClass; private LinkedHashMap<Device, KernelDeviceProfile> deviceProfiles = new LinkedHashMap<>(); @@ -25,12 +26,13 @@ public class KernelProfile { public double getLastExecutionTime() { KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile(); - return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED); + return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED) / MILLION; } public double getLastConversionTime() { KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile(); - return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED); } + return lastDeviceProfile == null ? 
Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED) / MILLION; + } public double getAccumulatedTotalTime() { KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile(); @@ -38,12 +40,12 @@ public class KernelProfile { return Double.NaN; } else { - return lastDeviceProfile.getCumulativeElapsedTimeAll(); + return lastDeviceProfile.getCumulativeElapsedTimeAll() / MILLION; } } - private KernelDeviceProfile getLastDeviceProfile() { - return null; + public KernelDeviceProfile getLastDeviceProfile() { + return deviceProfiles.get(currentDevice); } void onStart(Device device) { @@ -61,10 +63,11 @@ public class KernelProfile { void onEvent(ProfilingEvent event) { switch (event) { case CLASS_MODEL_BUILT: // fallthrough - case OPENCL_GENERATED: // fallthrough - case OPENCL_COMPILED: // fallthrough - case PREPARE_EXECUTE: // fallthrough - case EXECUTED: // fallthrough + case OPENCL_GENERATED: // fallthrough + case INIT_JNI: // fallthrough + case OPENCL_COMPILED: // fallthrough + case PREPARE_EXECUTE: // fallthrough + case EXECUTED: // fallthrough { if (currentDeviceProfile == null) { logger.log(Level.SEVERE, "Error in KernelProfile, no currentDevice (synchronization error?"); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java index f162d695..7f250d0f 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java @@ -75,6 +75,10 @@ import java.util.logging.*; */ public class KernelRunner extends KernelRunnerJNI{ + public static boolean BINARY_CACHING_DISABLED = false; + + private static final int MINIMUM_ARRAY_SIZE = 1; + /** @see #getCurrentPass() */ @UsedByJNICode public static final int PASS_ID_PREPARING_EXECUTION = -2; /** @see #getCurrentPass() */ @@ -129,6 +133,7 @@ public class KernelRunner extends 
KernelRunnerJNI{ private static final ForkJoinPool threadPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors(), lowPriorityThreadFactory, null, false); private static HashMap<Class<? extends Kernel>, String> openCLCache = new HashMap<>(); + private static LinkedHashSet<String> seenBinaryKeys = new LinkedHashSet<>(); /** * Create a KernelRunner for a specific Kernel instance. @@ -147,7 +152,34 @@ public class KernelRunner extends KernelRunnerJNI{ inBufferRemoteInt = inBufferRemote.asIntBuffer(); outBufferRemoteInt = outBufferRemote.asIntBuffer(); - KernelManager.instance(); // ensures static initialization of KernalManager + KernelManager.instance(); // ensures static initialization of KernelManager + } + + /** + * @see Kernel#cleanUpArrays(). + */ + public void cleanUpArrays() { + if (args != null && kernel.isRunningCL()) { + for (KernelArg arg : args) { + if ((arg.getType() & KernelRunnerJNI.ARG_ARRAY) != 0) { + Field field = arg.getField(); + if (field != null && field.getType().isArray() && !Modifier.isFinal(field.getModifiers())) { + field.setAccessible(true); + Class<?> componentType = field.getType().getComponentType(); + Object newValue = Array.newInstance(componentType, MINIMUM_ARRAY_SIZE); + try { + field.set(kernel, newValue); + } + catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } + } + } + kernel.execute(0); + } else if (kernel.isRunningCL()) { + logger.log(Level.SEVERE, "KernelRunner#cleanUpArrays() could not execute as no args available (Kernel has not been executed?)"); + } } /** @@ -156,7 +188,7 @@ public class KernelRunner extends KernelRunnerJNI{ * @see KernelRunnerJNI#disposeJNI(long) */ public void dispose() { - if (kernel.isRunningCL()) { + if (args != null || kernel.isRunningCL()) { disposeJNI(jniContextHandle); } // We are using a shared pool, so there's no need no shutdown it when kernel is disposed @@ -1005,7 +1037,7 @@ public class KernelRunner extends KernelRunnerJNI{ 
kernel.setFallbackExecutionMode(); } recreateRange(_settings); - return executeInternal(_settings); + return executeInternalInner(_settings); } private void recreateRange(ExecutionSettings _settings) { @@ -1075,33 +1107,23 @@ public class KernelRunner extends KernelRunnerJNI{ } recreateRange(_settings); - return executeInternal(_settings); - } - - private String describeDevice() { - Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel); - return (device == null) ? "<default fallback>" : device.getShortDescription(); - } - - @Override - public String toString() { - return "KernelRunner{" + kernel + "}"; + return executeInternalInner(_settings); } @SuppressWarnings("deprecation") public synchronized Kernel execute(String _entrypoint, final Range _range, final int _passes) { executing = true; - clearCancelMultiPass(); - KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass()); - KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); - boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO; - - ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode); try { + clearCancelMultiPass(); + KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass()); + KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); + boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO; + + ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode); // Two Kernels of the same class share the same KernelPreferences object, and since failure (fallback) generally mutates // the preferences object, we must lock it. Note this prevents two Kernels of the same class executing simultaneously. 
synchronized (preferences) { - return executeInternal(settings); + return executeInternalOuter(settings); } } finally { executing = false; @@ -1109,8 +1131,18 @@ public class KernelRunner extends KernelRunnerJNI{ } } + private synchronized Kernel executeInternalOuter(ExecutionSettings _settings) { + try { + return executeInternalInner(_settings); + } finally { + if (kernel.isAutoCleanUpArrays() &&_settings.range.getGlobalSize_0() != 0) { + cleanUpArrays(); + } + } + } + @SuppressWarnings("deprecation") - private synchronized Kernel executeInternal(ExecutionSettings _settings) { + private synchronized Kernel executeInternalInner(ExecutionSettings _settings) { if (_settings.range == null) { throw new IllegalStateException("range can't be null"); @@ -1119,7 +1151,7 @@ public class KernelRunner extends KernelRunnerJNI{ EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode(); if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) { - fallBackToNextDevice(_settings, "OpenCL was requested but Device supplied was not an OpenCLDevice"); + fallBackToNextDevice(_settings, "OpenCL EXECUTION_MODE was requested but Device supplied was not an OpenCLDevice"); } Device device = _settings.range.getDevice(); @@ -1151,9 +1183,6 @@ public class KernelRunner extends KernelRunnerJNI{ OpenCLDevice openCLDevice = device instanceof OpenCLDevice ? (OpenCLDevice) device : null; int jniFlags = 0; - if (_settings.legacyExecutionMode && device != null && !(device instanceof OpenCLDevice)) { - hashCode(); - } // for legacy reasons use old logic where Kernel.EXECUTION_MODE is not AUTO if (_settings.legacyExecutionMode && !userSpecifiedDevice && requestedExecutionMode.isOpenCL()) { if (requestedExecutionMode.equals(EXECUTION_MODE.GPU)) { @@ -1214,9 +1243,8 @@ public class KernelRunner extends KernelRunnerJNI{ // jniFlags |= (kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU) ? 
JNI_FLAG_USE_GPU : 0); // Init the device to check capabilities before emitting the // code that requires the capabilities. - - // synchronized(Kernel.class){ jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here + _settings.profile.onEvent(ProfilingEvent.INIT_JNI); } // end of synchronized! issue 68 if (jniContextHandle == 0) { @@ -1282,8 +1310,26 @@ public class KernelRunner extends KernelRunnerJNI{ } } - // Send the string to OpenCL to compile it - long handle = buildProgramJNI(jniContextHandle, openCL); + // Send the string to OpenCL to compile it, or if the compiled binary is already cached on JNI side just empty string to use cached binary + long handle; + if (BINARY_CACHING_DISABLED) { + handle = buildProgramJNI(jniContextHandle, openCL, ""); + } else { + synchronized (seenBinaryKeys) { + String binaryKey = kernel.getClass().getName() + ":" + device.getDeviceId(); + if (seenBinaryKeys.contains(binaryKey)) { + // use cached binary + logger.log(Level.INFO, "reusing cached binary for " + binaryKey); + handle = buildProgramJNI(jniContextHandle, "", binaryKey); + } + else { + // create and cache binary + logger.log(Level.INFO, "compiling new binary for " + binaryKey); + handle = buildProgramJNI(jniContextHandle, openCL, binaryKey); + seenBinaryKeys.add(binaryKey); + } + } + } _settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED); if (handle == 0) { return fallBackToNextDevice(_settings, "OpenCL compile failed"); @@ -1446,6 +1492,26 @@ public class KernelRunner extends KernelRunnerJNI{ } finally { _settings.profile.onEvent(ProfilingEvent.EXECUTED); + maybeReportProfile(_settings); + } + } + + @Override + public String toString() { + return "KernelRunner{" + kernel + "}"; + } + + private String describeDevice() { + Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel); + return (device == null) ? 
"<default fallback>" : device.getShortDescription(); + } + + private void maybeReportProfile(ExecutionSettings _settings) { + if (Config.dumpProfileOnExecution) { + StringBuilder report = new StringBuilder(); + report.append(KernelDeviceProfile.getTableHeader()).append('\n'); + report.append(_settings.profile.getLastDeviceProfile().getLastAsTableRow()); + System.out.println(report); } } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java index fcb06bfd..4e1d01d0 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java @@ -4,5 +4,5 @@ package com.amd.aparapi.internal.kernel; * Created by Barney on 02/09/2015. */ public enum ProfilingEvent { - START, CLASS_MODEL_BUILT, OPENCL_GENERATED, OPENCL_COMPILED, PREPARE_EXECUTE, EXECUTED + START, CLASS_MODEL_BUILT, INIT_JNI, OPENCL_GENERATED, OPENCL_COMPILED, PREPARE_EXECUTE, EXECUTED } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java index 132f4f21..d3db6a62 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java @@ -65,9 +65,9 @@ import java.util.logging.*; * @author gfrost * */ -public class ClassModel{ +public class ClassModel { - public interface LocalVariableInfo{ + public interface LocalVariableInfo { int getStart(); @@ -142,6 +142,7 @@ public class ClassModel{ }); // private ValueCache<String, Integer, ClassParseException> privateMemorySizes = ValueCache.on(this::computePrivateMemorySize); + private ValueCache<String, Integer, ClassParseException> privateMemorySizes = ValueCache .on(new ThrowingValueComputer<String, Integer, ClassParseException>(){ @Override @@ -635,19 +636,25 
@@ public class ClassModel{ return (methodDescription); } - // private static final ValueCache<Class<?>, ClassModel, ClassParseException> classModelCache = ValueCache.onIdentity(ClassModel::new); private static final ValueCache<Class<?>, ClassModel, ClassParseException> classModelCache = ValueCache .on(new ThrowingValueComputer<Class<?>, ClassModel, ClassParseException>(){ @Override public ClassModel compute(Class<?> key) throws ClassParseException { - return new ClassModel(key); + return createClassModelInternal(key); } }); + private static ClassModel createClassModelInternal(Class<?> key) throws ClassParseException { + ClassModel classModel = new ClassModel(key); + return classModel; + } + public static ClassModel createClassModel(Class<?> _class) throws ClassParseException { - if (CacheEnabler.areCachesEnabled()) + if (CacheEnabler.areCachesEnabled()) { return classModelCache.computeIfAbsent(_class); - return new ClassModel(_class); + } + + return createClassModelInternal(_class); } private int magic; @@ -746,7 +753,7 @@ public class ClassModel{ private final List<Entry> entries = new ArrayList<Entry>(); - public abstract class Entry{ + public abstract class Entry { private final ConstantPoolType constantPoolType; private final int slot; @@ -1559,7 +1566,7 @@ public class ClassModel{ } } - public class AttributePool{ + public class AttributePool { private final List<AttributePoolEntry> attributePoolEntries = new ArrayList<AttributePoolEntry>(); public class CodeEntry extends AttributePoolEntry{ @@ -1672,7 +1679,7 @@ public class ClassModel{ } } - public abstract class AttributePoolEntry{ + public abstract class AttributePoolEntry { protected int length; protected int nameIndex; @@ -1727,7 +1734,7 @@ public class ClassModel{ } public class InnerClassesEntry extends PoolEntry<InnerClassesEntry.InnerClassInfo>{ - public class InnerClassInfo{ + public class InnerClassInfo { private final int innerAccess; private final int innerIndex; @@ -1771,7 +1778,7 @@ public 
class ClassModel{ public class LineNumberTableEntry extends PoolEntry<LineNumberTableEntry.StartLineNumberPair>{ - public class StartLineNumberPair{ + public class StartLineNumberPair { private final int lineNumber; private final int start; @@ -2090,13 +2097,13 @@ public class ClassModel{ public class RuntimeAnnotationsEntry extends PoolEntry<RuntimeAnnotationsEntry.AnnotationInfo>{ - public class AnnotationInfo{ + public class AnnotationInfo { private final int typeIndex; private final int elementValuePairCount; public class ElementValuePair{ - class Value{ + class Value { Value(int _tag) { tag = _tag; } @@ -2383,7 +2390,7 @@ public class ClassModel{ private static ClassLoader classModelLoader = ClassModel.class.getClassLoader(); - public class ClassModelField{ + public class ClassModelField { private final int fieldAccessFlags; AttributePool fieldAttributePool; @@ -2450,7 +2457,7 @@ public class ClassModel{ } } - public class ClassModelMethod{ + public class ClassModelMethod { private final int methodAccessFlags; @@ -2554,7 +2561,7 @@ public class ClassModel{ } } - public class ClassModelInterface{ + public class ClassModelInterface { private final int interfaceIndex; ClassModelInterface(ByteReader _byteReader) { @@ -2805,7 +2812,10 @@ public class ClassModel{ Entrypoint getEntrypoint(String _entrypointName, String _descriptor, Object _k) throws AparapiException { if (CacheEnabler.areCachesEnabled()) { EntrypointKey key = EntrypointKey.of(_entrypointName, _descriptor); + long s = System.nanoTime(); Entrypoint entrypointWithoutKernel = entrypointCache.computeIfAbsent(key); + long e = System.nanoTime() - s; + System.out.println("newMethodModel: " + e / 1000000f); return entrypointWithoutKernel.cloneForKernel(_k); } else { final MethodModel method = getMethodModel(_entrypointName, _descriptor); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java index 
ece7e391..7eec09b7 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java @@ -1,7 +1,7 @@ package com.amd.aparapi.internal.model; -import java.util.NoSuchElementException; -import java.util.concurrent.atomic.AtomicReference; +import java.util.*; +import java.util.concurrent.atomic.*; interface Optional<E> { final class Some<E> implements Optional<E>{ @@ -49,7 +49,7 @@ interface Optional<E> { boolean isPresent(); } -public interface Memoizer<T> extends Supplier<T>{ +public interface Memoizer<T> extends Supplier<T> { public final class Impl<T> implements Memoizer<T>{ private final Supplier<T> supplier; diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java index ef66a53f..63906ed0 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java @@ -1,9 +1,7 @@ package com.amd.aparapi.internal.model; -import java.lang.ref.Reference; -import java.lang.ref.SoftReference; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; +import java.lang.ref.*; +import java.util.concurrent.*; //import java.util.function.Supplier; @@ -14,7 +12,7 @@ public final class ValueCache<K, V, T extends Throwable> { } // @FunctionalInterface - public interface ValueComputer<K, V> extends ThrowingValueComputer<K, V, RuntimeException>{ + public interface ValueComputer<K, V> extends ThrowingValueComputer<K, V, RuntimeException> { // Marker interface } diff --git a/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java b/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java index bdc14862..074ed2b0 100644 --- a/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java +++ 
b/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java @@ -76,7 +76,6 @@ public class Main{ /** * @brief Abromowitz Stegun approxmimation for PHI (Cumulative Normal Distribution Function) * @param X input value - * @param phi pointer to store calculated CND of X */ float phi(float X) { final float c1 = 0.319381530f; @@ -183,18 +182,15 @@ public class Main{ int size = Integer.getInteger("size", 512); Range range = Range.create(size); - int iterations = Integer.getInteger("iterations", 5); + int iterations = Integer.getInteger("iterations", 20); System.out.println("size =" + size); System.out.println("iterations =" + iterations); BlackScholesKernel kernel = new BlackScholesKernel(size); - long totalExecTime = 0; - long iterExecTime = 0; - /* for (int i = 0; i < iterations; i++) { - iterExecTime = kernel.execute(size).getExecutionTime(); - totalExecTime += iterExecTime; - }*/ + kernel.execute(size).getExecutionTime(); + } + kernel.execute(range, iterations); System.out.println("Average execution time " + kernel.getAccumulatedExecutionTime() / iterations); kernel.showResults(10); diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java new file mode 100644 index 00000000..c09d0ab2 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java @@ -0,0 +1,20 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.sample.mandel.*; + +public class AutoCleanUpArraysDemo { + public static void main(String[] ignored) { + + System.setProperty("com.amd.aparapi.dumpProfileOnExecution", "true"); + + int size = 1024; + int[] rgbs = new int[size * size]; + Main.MandelKernel kernel = new Main.MandelKernel(size, size, rgbs); + kernel.setAutoCleanUpArrays(true); + kernel.execute(size * size); + System.out.println("length = " + kernel.getRgbs().length); + 
kernel.resetImage(size, size, rgbs); + kernel.execute(size * size); + System.out.println("length = " + kernel.getRgbs().length); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java new file mode 100644 index 00000000..26d832f4 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java @@ -0,0 +1,25 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.sample.mandel.*; + +public class CleanUpArraysDemo { + public static void main(String[] ignored) { + + System.setProperty("com.amd.aparapi.enableVerboseJNI", "true"); + System.setProperty("com.amd.aparapi.enableVerboseJNIOpenCLResourceTracking", "true"); + System.setProperty("com.amd.aparapi.enableExecutionModeReporting", "true"); + System.setProperty("com.amd.aparapi.dumpProfileOnExecution", "true"); + + int size = 1024; + int[] rgbs = new int[size * size]; + Main.MandelKernel kernel = new Main.MandelKernel(size, size, rgbs); + kernel.execute(size * size); + System.out.println("length = " + kernel.getRgbs().length); + System.out.println("Cleaning up arrays"); + kernel.cleanUpArrays(); + System.out.println("length = " + kernel.getRgbs().length); + kernel.resetImage(size, size, rgbs); + kernel.execute(size * size); + System.out.println("length = " + kernel.getRgbs().length); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java index bdfb3cf2..67d7cc02 100644 --- a/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java @@ -10,16 +10,14 @@ import java.util.*; */ public class ConfigurationDemo { public static void main(String[] ignored) { - 
System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); - StringBuilder report; List<Integer> tests = Arrays.asList(0, 1, 2, 3); - int reps = 300; + int reps = 1; for (int rep = 0; rep < reps; ++rep) { runTests(rep == 0, tests); - if (rep % 100 == 99 || rep == 0) { + if (rep % 100 == 99 || rep == 0 || rep == reps - 1) { report = new StringBuilder("rep = " + rep + "\n"); KernelManager.instance().reportDeviceUsage(report, true); System.out.println(report); diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java new file mode 100644 index 00000000..aeea4ea5 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java @@ -0,0 +1,83 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.sample.blackscholes.Main.*; +import com.amd.aparapi.sample.mandel.*; + +/** + * Demonstrate new enhanced profiling capability, profiling the kernel from the blackscholes sample. + */ +public class ProfilingDemo { + + private static BlackScholesKernel kernel; + + public static void main(String[] ignored) { + + final int size = 1024; + newBlackScholesKernel(size); + + // first execute an arbitrary Kernel (not the one we are profiling!) 
a few times to ensure class loading and initial JIT optimisations have + // been performed before we start the profiling + int warmups = 5; + for (int i = 0; i < warmups; ++i) { + runWarmup(); + } + + String tableHeader = KernelDeviceProfile.getTableHeader(); + + boolean newKernel = false; + + runOnce(size, newKernel); + System.out.println("First run:"); + printLastProfile(tableHeader); + + + int reps = 20; + + System.out.println("\nSubsequent runs using same kernel:"); + for (int rep = 0; rep < reps; ++rep) { + runOnce(size, newKernel); + printLastProfile(tableHeader); + } + + newKernel = true; + System.out.println("\nSubsequent runs using new kernels:"); + for (int rep = 0; rep < reps; ++rep) { + runOnce(size, newKernel); + printLastProfile(tableHeader); + } + + // Note. You will see from the output that there is a substantial cost to Kernel creation (vs Kernel reuse), almost entirely due to KernelRunner#initJNI + + } + + private static void printLastProfile(String tableHeader) { + KernelProfile profile = KernelManager.instance().getProfile(BlackScholesKernel.class); + KernelDeviceProfile deviceProfile = profile.getLastDeviceProfile(); + String row = deviceProfile.getLastAsTableRow(); + System.out.println(tableHeader); + System.out.println(row); + } + + private static void runOnce(int size, boolean newKernel) { + if (newKernel) { + newBlackScholesKernel(size); + } + kernel.execute(size); + } + + private static void runWarmup() { + int[] rgb = new int[512 * 512]; + Kernel warmupKernel = new Main.MandelKernel(512, 512, rgb); + warmupKernel.execute(512 * 512); + } + + private static void newBlackScholesKernel(int size) { + if (kernel != null) { + kernel.dispose(); + } + System.gc(); + kernel = new BlackScholesKernel(size); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java new file mode 100644 index 
00000000..2f3252c6 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java @@ -0,0 +1,14 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.internal.kernel.*; + +/** + * Created by Barney on 13/09/2015. + */ +public class ProfilingDemoNoBinaryCaching { + + public static void main(String[] ignored) { + KernelRunner.BINARY_CACHING_DISABLED = true; + ProfilingDemo.main(null); + } +} diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java index fc70267e..597317a6 100644 --- a/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java +++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java @@ -38,15 +38,15 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.sample.convolution; -import java.io.File; +import com.amd.aparapi.*; -import com.amd.aparapi.Kernel; +import java.io.*; public class Convolution { - public static void main(final String[] _args) { + public static void main(final String[] _args) throws IOException { - final File file = new File(_args.length == 1 ? _args[0] : "testcard.jpg"); + final File file = new File(_args.length == 1 ? 
_args[0] : "./samples/convolution/testcard.jpg").getCanonicalFile(); final ImageConvolution convolution = new ImageConvolution(); diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java index 13de9585..d527917a 100644 --- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java +++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java @@ -60,6 +60,16 @@ import java.util.List; public class Main{ + static { + System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); +// KernelManager.setKernelManager(new KernelManager() { +// @Override +// protected List<Device.TYPE> getPreferredDeviceTypes() { +// return Collections.singletonList(Device.TYPE.CPU); +// } +// }); + } + /** * An Aparapi Kernel implementation for creating a scaled view of the mandelbrot set. * @@ -70,13 +80,13 @@ public class Main{ public static class MandelKernel extends Kernel{ /** RGB buffer used to store the Mandelbrot image. This buffer holds (width * height) RGB values. */ - final private int rgb[]; + private int[] rgb; /** Mandelbrot image width. */ - final private int width; + private int width; /** Mandelbrot image height. */ - final private int height; + private int height; /** Maximum iterations for Mandelbrot. */ final private int maxIterations = 64; @@ -112,6 +122,12 @@ public class Main{ } + public void resetImage(int _width, int _height, int[] _rgb) { + width = _width; + height = _height; + rgb = _rgb; + } + public int getCount(float x, float y) { int count = 0; @@ -152,6 +168,9 @@ public class Main{ scale = _scale; } + public int[] getRgbs() { + return rgb; + } } /** User selected zoom-in point on the Mandelbrot view. 
*/ diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java index 8a1b7faa..5bdd9805 100644 --- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java +++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java @@ -143,6 +143,7 @@ public class Main2D{ @SuppressWarnings("serial") public static void main(String[] _args) { + final JFrame frame = new JFrame("MandelBrot"); /** Mandelbrot image height. */ diff --git a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java index f4e3e28c..b94c359d 100644 --- a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java +++ b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java @@ -1,6 +1,5 @@ package com.amd.aparapi.sample.median; -import com.amd.aparapi.device.*; import com.amd.aparapi.internal.kernel.*; import javax.imageio.*; @@ -8,7 +7,6 @@ import javax.swing.*; import java.awt.*; import java.awt.image.*; import java.io.*; -import java.util.*; /** * Demonstrate use of __private namespaces and @NoCL annotations. 
@@ -27,12 +25,10 @@ public class MedianDemo { } } - private static final boolean TEST_JTP = true; - public static void main(String[] ignored) { final int size = 5; - System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "true"); - boolean verbose = true; + System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); + boolean verbose = false; if (verbose) { System.setProperty("com.amd.aparapi.enableVerboseJNI", "true"); @@ -42,18 +38,22 @@ public class MedianDemo { System.setProperty("com.amd.aparapi.enableExecutionModeReporting", "true"); } - if (TEST_JTP) { - LinkedHashSet<Device> devices = new LinkedHashSet<>(Collections.singleton(JavaDevice.THREAD_POOL)); - KernelManager.instance().setDefaultPreferredDevices(devices); - } +// KernelManager.setKernelManager(new KernelManager(){ +// @Override +// protected Comparator<OpenCLDevice> getDefaultGPUComparator() { +// return new Comparator<OpenCLDevice>() { +// @Override +// public int compare(OpenCLDevice o1, OpenCLDevice o2) { +// return o2.getMaxComputeUnits() - o1.getMaxComputeUnits(); +// } +// }; +// } +// }); + + System.out.println(KernelManager.instance().bestDevice()); int[] argbs = testImage.getRGB(0, 0, testImage.getWidth(), testImage.getHeight(), null, 0, testImage.getWidth()); - MedianKernel7x7 kernel = new MedianKernel7x7(); - kernel._imageTypeOrdinal = MedianKernel7x7.RGB; - kernel._sourceWidth = testImage.getWidth(); - kernel._sourceHeight = testImage.getHeight(); - kernel._sourcePixels = argbs; - kernel._destPixels = new int[argbs.length]; + MedianKernel7x7 kernel = createMedianKernel(argbs); kernel.processImages(new MedianSettings(size)); BufferedImage out = new BufferedImage(testImage.getWidth(), testImage.getHeight(), BufferedImage.TYPE_INT_RGB); @@ -71,12 +71,35 @@ public class MedianDemo { frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); frame.setVisible(true); - int reps = 20; + StringBuilder builder = new StringBuilder(); + KernelManager.instance().reportDeviceUsage(builder, 
true); + System.out.println(builder); + + int reps = 50; + final boolean newKernel = false; for (int rep = 0; rep < reps; ++rep) { + if (newKernel) { + kernel.dispose(); + kernel = createMedianKernel(argbs); + } long start = System.nanoTime(); kernel.processImages(new MedianSettings(size)); long elapsed = System.nanoTime() - start; System.out.println("elapsed = " + elapsed / 1000000f + "ms"); } + + builder = new StringBuilder(); + KernelManager.instance().reportDeviceUsage(builder, true); + System.out.println(builder); + } + + private static MedianKernel7x7 createMedianKernel(int[] argbs) { + MedianKernel7x7 kernel = new MedianKernel7x7(); + kernel._imageTypeOrdinal = MedianKernel7x7.RGB; + kernel._sourceWidth = testImage.getWidth(); + kernel._sourceHeight = testImage.getHeight(); + kernel._sourcePixels = argbs; + kernel._destPixels = new int[argbs.length]; + return kernel; } } -- GitLab