diff --git a/data/tree.yml b/data/tree.yml
index 81ad476ee008ec3c656a31d14f0deaf75a89e234..642e73afe361a323537e7fca491f76a7b9ac2c36 100644
--- a/data/tree.yml
+++ b/data/tree.yml
@@ -3,36 +3,32 @@
 index.html.haml: "/index.html.haml"
 introduction:
   about.html.md: "/introduction/about.html.md"
   getting-started.html.md: "/introduction/getting-started.html.md"
+  faq.html.md: "/introduction/faq.html.md"
 documentation:
-  AccessingMultiDimNDRangeProposal.html.md: "/documentation/AccessingMultiDimNDRangeProposal.html.md"
-  AddingLambdasToAparapi.html.md: "/documentation/AddingLambdasToAparapi.html.md"
-  AddressSpacesUsingBuffers.html.md: "/documentation/AddressSpacesUsingBuffers.html.md"
-  AparapiExtensionProposal.html.md: "/documentation/AparapiExtensionProposal.html.md"
-  AparapiPatterns.html.md: "/documentation/AparapiPatterns.html.md"
-  BuildingNBody.html.md: "/documentation/BuildingNBody.html.md"
-  ByteCode2OpenCL.pdf: "/documentation/ByteCode2OpenCL.pdf"
-  ChoosingSpecificDevicesForExecution.html.md: "/documentation/ChoosingSpecificDevicesForExecution.html.md"
-  ConvertingBytecodeToOpenCL.html.md: "/documentation/ConvertingBytecodeToOpenCL.html.md"
-  DeviceProposal.html.md: "/documentation/DeviceProposal.html.md"
-  EmulatingMultipleEntrypointsUsingCurrentAPI.html.md: "/documentation/EmulatingMultipleEntrypointsUsingCurrentAPI.html.md"
-  ExplicitBufferHandling.html.md: "/documentation/ExplicitBufferHandling.html.md"
-  FrequentlyAskedQuestions.html.md: "/documentation/FrequentlyAskedQuestions.html.md"
-  HSAEnablementOfLambdaBranch.html.md: "/documentation/HSAEnablementOfLambdaBranch.html.md"
-  HSAEnablementOfLambdaBranchSidebar.html.md: "/documentation/HSAEnablementOfLambdaBranchSidebar.html.md"
-  JavaKernelGuidelines.html.md: "/documentation/JavaKernelGuidelines.html.md"
-  LIbraryAgentDuality.html.md: "/documentation/LIbraryAgentDuality.html.md"
-  MultipleEntryPointSupportProposal.html.md: "/documentation/MultipleEntryPointSupportProposal.html.md"
-  NewFeatures.html.md: "/documentation/NewFeatures.html.md"
-  NewOpenCLBinding.html.md: "/documentation/NewOpenCLBinding.html.md"
-  PossibleAparapiLambdaSyntaxOptions.html.md: "/documentation/PossibleAparapiLambdaSyntaxOptions.html.md"
-  PrivateMemorySpace.html.md: "/documentation/PrivateMemorySpace.html.md"
-  ProfilingKernelExecution.html.md: "/documentation/ProfilingKernelExecution.html.md"
-  ProfilingKernelsFromEclipse.html.md: "/documentation/ProfilingKernelsFromEclipse.html.md"
-  SettingUpLinuxHSAMachineForAparapi.html.md: "/documentation/SettingUpLinuxHSAMachineForAparapi.html.md"
-  SettingUpLinuxHSAMachineForAparapiSidebar.html.md: "/documentation/SettingUpLinuxHSAMachineForAparapiSidebar.html.md"
-  UnitTestGuide.html.md: "/documentation/UnitTestGuide.html.md"
-  UsingAparapiLambdaBranchWithHSASimulator.html.md: "/documentation/UsingAparapiLambdaBranchWithHSASimulator.html.md"
-  UsingConstantMemory.html.md: "/documentation/UsingConstantMemory.html.md"
-  UsingLocalMemory.html.md: "/documentation/UsingLocalMemory.html.md"
-  UsingMultiDimExecutionRanges.html.md: "/documentation/UsingMultiDimExecutionRanges.html.md"
+  aparapi-patterns.html.md: "/documentation/aparapi-patterns.html.md"
+  choosing-specific-devices.html.md: "/documentation/choosing-specific-devices.html.md"
+  converting-java-to-opencl.html.md: "/documentation/converting-java-to-opencl.html.md"
+  emulating-multiple-entrypoints.html.md: "/documentation/emulating-multiple-entrypoints.html.md"
+  explicit-buffer-handling.html.md: "/documentation/explicit-buffer-handling.html.md"
+  hsa-enabled-lambda.html.md: "/documentation/hsa-enabled-lambda.html.md"
+  kernel-guidelines.html.md: "/documentation/kernel-guidelines.html.md"
+  library-agent-duality.html.md: "/documentation/library-agent-duality.html.md"
+  new-features.html.md: "/documentation/new-features.html.md"
+  opencl-bindings.html.md: "/documentation/opencl-bindings.html.md"
+  private-memory-space.html.md: "/documentation/private-memory-space.html.md"
+  profiling-the-kernel.html.md: "/documentation/profiling-the-kernel.html.md"
+  setting-up-hsa.html.md: "/documentation/setting-up-hsa.html.md"
+  unit-tests.html.md: "/documentation/unit-tests.html.md"
+  using-hsa-simulator.html.md: "/documentation/using-hsa-simulator.html.md"
+  constant-memory.html.md: "/documentation/constant-memory.html.md"
+  local-memory.html.md: "/documentation/local-memory.html.md"
+  multiple-dim-ranges.html.md: "/documentation/multiple-dim-ranges.html.md"
+proposals:
+  multiple-dim-nd-range.html.md: "/proposals/multiple-dim-nd-range.html.md"
+  lambdas.html.md: "/proposals/lambdas.html.md"
+  address-space-with-buffers.html.md: "/proposals/address-space-with-buffers.html.md"
+  extensions.html.md: "/proposals/extensions.html.md"
+  device.html.md: "/proposals/device.html.md"
+  multiple-entry-points.html.md: "/proposals/multiple-entry-points.html.md"
+  lambda-syntax.html.md: "/proposals/lambda-syntax.html.md"
 showcase.html.haml: "/showcase.html.haml"
\ No newline at end of file
diff --git a/source/documentation/AccessingMultiDimNDRangeProposal.html.md b/source/documentation/AccessingMultiDimNDRangeProposal.html.md
deleted file mode 100644
index 3af9c5832868f68b2065565ec2fe0676b8f005d9..0000000000000000000000000000000000000000
--- a/source/documentation/AccessingMultiDimNDRangeProposal.html.md
+++ /dev/null
@@ -1,200 +0,0 @@
---
title: AccessingMultiDimNDRangeProposal
---

*A proposal for accessing multi-dim ND range execution. Updated Dec 14, 2011 by frost.g...@gmail.com*

We can discuss this proposal either here (in comments) or via the discussion list.

Note this has nothing to do with accessing Java 2D arrays in Aparapi. This discussion is focused on the ability to expose the execution of kernels over 1, 2 or 3 dimensions. The memory in each case is a single contiguous region (like a single-dimension primitive array).

At present an Aparapi kernel can only be executed over a single dimension. If we wish to represent execution over a WIDTH x HEIGHT element grid, we execute over the range (WIDTH*HEIGHT) and manually divide/mod getGlobalId() by WIDTH to determine the x and y for each element.

Similarly we multiply y by WIDTH and add x (y*WIDTH+x) to convert an x,y location to a linear global id.

    final static int WIDTH=128;
    final static int HEIGHT=64;
    final int in[] = new int[WIDTH*HEIGHT];
    final int out[] = new int[WIDTH*HEIGHT];
    Kernel kernel = new Kernel(){
       public void run(){
          int x = getGlobalId()%WIDTH;
          int y = getGlobalId()/WIDTH;
          if (!(x==1 || x==(WIDTH-1) || y==1 || y==(HEIGHT-1))){
             int sum = 0;
             for (int dx =-1; dx<2; dx++){
                for (int dy =-1; dy<2; dy++){
                   sum+=in[(y+dy)*WIDTH+(x+dx)];
                }
             }
             out[y*WIDTH+x] = sum/9;
             // or out[getGlobalId()] = sum/9;
          }
       }
    };
    kernel.execute(WIDTH*HEIGHT);

OpenCL natively allows the user to execute over 1, 2 or 3 dimension grids via the clEnqueueNDRangeKernel() method.

We chose not to expose this in Aparapi, but there have been requests for us to allow it.

There are a number of things to consider here:
1. Extending the syntax of kernel.execute() to allow multi-dimensional grids.
1. Mapping Kernel methods to OpenCL's get_local_id(int dim), get_local_size(int dim), get_group_id(int dim), etc. At present we map kernel.getGlobalId() to get_global_id(0).
1. Handling all of these when an application drops back to JTP mode.

##Extending Kernel.execute(int range)
Sadly we can't overload Kernel.execute(int range), Kernel.execute(int xrange, int yrange) and Kernel.execute(int xrange, int yrange, int zrange), because we already have kernel.execute(int, int) mapped for executing multiple passes over the linear range.

Remember

    for (int pass=0; pass<20; pass++){
       kernel.execute(1024);
    }

is equivalent to

    kernel.execute(1024, 20);

I think I would prefer

    Kernel.execute(int range)
    Kernel.execute(int range, int passes)
    Kernel.executeXY(int xrange, int yrange)
    Kernel.executeXY(int xrange, int yrange, int passes)
    Kernel.executeXYZ(int xrange, int yrange, int zrange)
    Kernel.executeXYZ(int xrange, int yrange, int zrange, int passes)

Obviously in the above calls we are only supplying the global bounds for the grid. We could also provide mappings allowing local ranges. I think I would prefer

    Kernel.executeLocal(int range, int local)
    Kernel.executeLocal(int range, int local, int passes)
    Kernel.executeXYLocal(int xrange, int yrange, int xlocalrange, int ylocalrange)
    Kernel.executeXYLocal(int xrange, int yrange, int xlocalrange, int ylocalrange, int passes)
    Kernel.executeXYZLocal(int xrange, int yrange, int zrange, int xlocalrange, int ylocalrange, int zlocalrange)
    Kernel.executeXYZLocal(int xrange, int yrange, int zrange, int xlocalrange, int ylocalrange, int zlocalrange, int passes)

Another alternative may be to create Range classes

    class Range{
       int passes;
       int width;
       static Range create(int width);
       static Range create(int width, int passes);
    }

    class Range2D extends Range{
       int height;
       static Range create(int width, int height);
       static Range create(int width, int height, int passes);
    }

    class Range3D extends Range2D{
       int depth;
       static Range create(int width, int height, int depth);
       static Range create(int width, int height, int depth, int passes);
    }

with appropriate constructors (or factory methods) to allow

    Kernel.execute(Range range)

Execution would then simply be

    Kernel.execute(Range.create(1,1))

We can also arrange for the group size to be placed in the base Range class.

    class Range{
       int groupSize;
       int passes;
       int width;
       static Range create(int width);
       static Range create(int width, int passes);
    }

##Mapping to OpenCL multi dim methods, i.e. get_global_id(1), get_local_size(2) etc
We could just add getGlobalId(int dim), getLocalSize(int dim) etc to replicate the OpenCL methods.
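For illustration, a kernel written against that style of API might look like this (a sketch only: the dimension-indexed getGlobalId(int dim)/getGlobalSize(int dim) accessors are the hypothetical additions being discussed here, not current Aparapi API, and in[]/out[] are assumed kernel fields):

    // Sketch: hypothetical dimension-indexed accessors mirroring OpenCL.
    Kernel kernel = new Kernel(){
       public void run(){
          int x = getGlobalId(0);   // would map to get_global_id(0)
          int y = getGlobalId(1);   // would map to get_global_id(1)
          int w = getGlobalSize(0); // would map to get_global_size(0)
          out[y*w + x] = in[y*w + x];
       }
    };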
I would prefer to offer the following global mappings

|Kernel | OpenCL|
|-----|------|
|getGlobalId()| get_global_id(0)|
|getGlobalX()| get_global_id(0)|
|getGlobalY()| get_global_id(1)|
|getGlobalZ()| get_global_id(2)|
|getGlobalSize()| get_global_size(0)|
|getGlobalWidth()| get_global_size(0)|
|getGlobalHeight()| get_global_size(1)|
|getGlobalDepth()| get_global_size(2)|

And the following local mappings

|Kernel| OpenCL|
|-----|-------|
|getLocalId()| get_local_id(0)|
|getLocalX()| get_local_id(0)|
|getLocalY()| get_local_id(1)|
|getLocalZ()| get_local_id(2)|
|getLocalSize()| get_local_size(0)|
|getLocalWidth()| get_local_size(0)|
|getLocalHeight()| get_local_size(1)|
|getLocalDepth()| get_local_size(2)|

##An example

    final static int WIDTH=128;
    final static int HEIGHT=64;
    final int in[] = new int[WIDTH*HEIGHT];
    final int out[] = new int[WIDTH*HEIGHT];
    Kernel kernel = new Kernel(){
       public void run(){
          int x = getGlobalX();
          int y = getGlobalY();
          if (!(x==1 || x==(getGlobalWidth()-1) || y==1 || y==(getGlobalHeight()-1))){
             int sum = 0;
             for (int dx =-1; dx<2; dx++){
                for (int dy =-1; dy<2; dy++){
                   sum+=in[(y+dy)*getGlobalWidth()+(x+dx)];
                }
             }
             out[y*getGlobalWidth()+x] = sum/9;
             // or out[getGlobalId()] = sum/9;
          }
       }
    };
    kernel.executeXY(WIDTH, HEIGHT);

Or if we choose the Range class approach.

    final static int WIDTH=128;
    final static int HEIGHT=64;
    final int in[] = new int[WIDTH*HEIGHT];
    final int out[] = new int[WIDTH*HEIGHT];
    Kernel kernel = new Kernel(){
       public void run(){
          int x = getGlobalX();
          int y = getGlobalY();
          if (!(x==1 || x==(getGlobalWidth()-1) || y==1 || y==(getGlobalHeight()-1))){
             int sum = 0;
             for (int dx =-1; dx<2; dx++){
                for (int dy =-1; dy<2; dy++){
                   sum+=in[(y+dy)*getGlobalWidth()+(x+dx)];
                }
             }
             out[y*getGlobalWidth()+x] = sum/9;
             // or out[getGlobalId()] = sum/9;
          }
       }
    };
    kernel.execute(Range2D.create(WIDTH, HEIGHT));

##Handling this from JTP mode
Mapping to OpenCL for this is all fairly straightforward.

In Java JTP mode we will have to emulate this. For get_global_id(0..2) (getGlobalX(), getGlobalY() and getGlobalZ() using our proposed Aparapi Java mappings) we can easily offer reasonable implementations; this just requires the Java code to essentially nest 3 loops (or emulate doing so) and set globalX, globalY and globalZ inside each nesting.

For get_local_size(0..2) (getLocalWidth(), getLocalHeight() and getLocalDepth() using our proposed Aparapi Java mappings) we will need to break the globalWidth/globalHeight/globalDepth into some arbitrary equal 'chunks' (note I am avoiding the word 'groups' here to avoid confusion with get_group_size(0..2)).

At present we always create a synthetic group in JTP mode which is the # of cores. This will need to be changed. If the user requests a grid (64,64,8,8) (global width 64, global height 64, local width 8, local height 8) then we will have to create a JTP group of 64 (8x8), and just in case the kernel code contains a barrier, we will need to ensure we launch 64 threads for this group. From our experience it is best to launch one thread per core, so we may lose some JTP performance executing in this mode.
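As a minimal sketch (illustrative only, not the actual Aparapi internals), the JTP emulation could recover the x/y/z global ids from a flat work-item index like this:

    // Sketch: deriving 3D global ids from a flat JTP work-item index.
    static int[] toXYZ(int tid, int globalWidth, int globalHeight) {
       int x = tid % globalWidth;                   // getGlobalX()
       int y = (tid / globalWidth) % globalHeight;  // getGlobalY()
       int z = tid / (globalWidth * globalHeight);  // getGlobalZ()
       return new int[]{x, y, z};
    }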
\ No newline at end of file
diff --git a/source/documentation/AparapiExtensionProposal.html.md b/source/documentation/AparapiExtensionProposal.html.md
deleted file mode 100644
index 0fe22d6eedd86a82efb4b4887491ec43ae974c00..0000000000000000000000000000000000000000
--- a/source/documentation/AparapiExtensionProposal.html.md
+++ /dev/null
@@ -1,261 +0,0 @@
---
title: AparapiExtensionProposal
---

*A proposed Aparapi extension mechanism. Updated Feb 29, 2012 by frost.g...@gmail.com*

##Here is a proposed Aparapi extension mechanism
This would allow a developer to create a library that could be used by Aparapi Kernel code. The library would include both OpenCL and Java implementations.

We will treat this as a live document. Please join the discussions at http://groups.google.com/group/aparapi-discuss/browse_thread/thread/7ec81ecb2169aa4 and I will update this page to reflect what I think the latest decisions are.

Currently Aparapi allows Java bytecode to be converted to OpenCL at runtime. Only the OpenCL generated by this conversion process is made available. Sometimes, for performance reasons, we might want to allow hand-coded OpenCL to be called from Aparapi kernel code.

Here we present a strawman API which would allow extension points to be added by an end user or by a library provider.

We will use an FFT use case to walk through the steps.

The FFT (Fast Fourier Transform) algorithm can be coded in Aparapi, but handcrafted OpenCL is likely to be more performant. The goal is to allow Aparapi to do what it does best, i.e. manage the host buffer allocations, and provide a mechanism for binding arbitrary OpenCL code at runtime.

So let's assume we wanted an Aparapi Kernel to be able to call an Aparapi extension for computing FFT (forward and reverse). The Kernel implementation might look like this.

    public static class BandStopFilter extends Kernel{
       FFT fft = new FFT(); // Create an instance of the Extension point.
       float[] real;
       float[] imaginary;

       BandStopFilter (float[] _real){
          real = _real;
          imaginary = new float[_real.length];
       }

       @Override public void run() {
          fft.forward(real, imaginary);
       }
    }

The main method then would just execute the Kernel using the familiar kernel.execute() method:

    public static void main(String[] args) {
       float[] data = new float[1024];
       BandStopFilter kernel = new BandStopFilter (data);
       kernel.execute(data.length);
    }

Essentially we want the FFT.forward(float[] _real, float[] _imaginary) and FFT.reverse(float[] _real, float[] _imaginary) methods to be callable from Aparapi Kernel code. We want Aparapi to handle the call-forwarding and the argument/buffer mapping transfers. We want Aparapi to call the Java methods normally if OpenCL is not available, but would like Aparapi to use the implementor-provided OpenCL if it is. So the implementor will be required to provide both a Java and an OpenCL version of the callable methods, because Aparapi will decide which version needs to be called at runtime.

Any extension point is required to implement the AparapiExtensionPoint interface.

    public interface AparapiExtensionPoint {
       public String getOpenCL();
    }

Here is a possible (although incomplete) FFT implementation.
    public class FFT implements AparapiExtensionPoint{
       @AparapiCallable public void forward(
          @Global @ReadWrite float[] _data,
          @Global @ReadWrite float[] _imaginary) {
          // java implementation
       }

       @AparapiCallable public void reverse(
          @Global @ReadWrite float[] _data,
          @Global @ReadWrite float[] _imaginary) {
          // java implementation
       }

       @Override public String getOpenCL() {
          return ""
             +"void my_package_FFT_forward("
             +"   __global float* _real,"
             +"   __global float* _imaginary )"
             +"   {"
             +"   // OpenCL implementation"
             +"   }"
             +"void my_package_FFT_reverse("
             +"   __global float* _real,"
             +"   __global float* _imaginary )"
             +"   {"
             +"   // OpenCL implementation"
             +"   }";
       }
    }

The implementer's class is required to define the callable Aparapi methods as well as implement the `getOpenCL()` method, so that the OpenCL implementation of those methods can be extracted at run-time.

Aparapi will provide annotations to decorate the methods and args/parameters of the exposed callable methods. These annotations provide information so that Aparapi can locate the callable methods, as well as parameter hints to help coordinate buffer types (global, local, constant) and transfer directions (read, write, readWrite) when executing the methods from a Kernel. This information is consulted during the normal bytecode analysis that Aparapi performs when it hits the call site.

Note that the Java code inside the `@AparapiCallable` functions (or code executed from them) is not constrained to the normal Aparapi subset. It can be any legitimate Java code, but should be thread safe (because it will be called from JTP mode!).

Note also that the OpenCL code yielded by the `getOpenCL()` method is assumed to be complete; Aparapi does not attempt to parse this code. If the code fails to compile, Aparapi will fall back and execute the whole Kernel in JTP mode.

BTW we show getOpenCL() returning a String literal. This is most likely to be how code is returned. However, it could also be extracted from a file, from a resource in the jar, or dynamically generated based on some state. For example an FFT implementation might choose to use different code for radix2 or radix4 implementations (based on a parameter passed to the `FFT()` constructor - say `FFT(FFT.RADIX2)`) - in which case the getOpenCL() method might yield different code.

The above proposal covers the case where a third party might want to provide an Aparapi extension point as a library.

We might also consider allowing single methods within the Kernel to be optimized, where the OpenCL is made available via the AparapiCallable annotation. The method would still use the same annotations for the args (to allow buffer transfers to be optimized).
    Kernel k = new Kernel(){
       @AparapiCallable("/* opencl code for sum() goes here */")
       int sum(@Global @ReadWrite int[] data, int length){
          int sum = 0;
          for (int v:data){
             sum+=v;
          }
          return sum;
       }
       @Override public void run(){
          sum(data, data.length);
       }
    };

Here are the proposed new interfaces/annotations

    public interface AparapiExtensionPoint {
       public String getOpenCL();
    }

    @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.METHOD)
    public @interface AparapiCallable {
       String value() default "";
    }

    @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
    public @interface Global {}

    @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
    public @interface Local {}

    @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
    public @interface Constant {}

    @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
    public @interface ReadWrite {}

    @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
    public @interface ReadOnly {}

    @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
    public @interface WriteOnly {}

And here is the example code in one chunk

    public class FFT implements AparapiExtensionPoint{
       @AparapiCallable public void forward(
          @Global @ReadWrite float[] _data,
          @Global @ReadWrite float[] _imaginary) {
          // java implementation
       }

       @AparapiCallable public void reverse(
          @Global @ReadWrite float[] _data,
          @Global @ReadWrite float[] _imaginary) {
          // java implementation
       }

       @Override public String getOpenCL() {
          return ""
             +"void my_package_FFT_forward("
             +"   __global float* _real,"
             +"   __global float* _imaginary )"
             +"   {"
             +"   // OpenCL implementation"
             +"   }"
             +"void my_package_FFT_reverse("
             +"   __global float* _real,"
             +"   __global float* _imaginary )"
             +"   {"
             +"   // OpenCL implementation"
             +"   }";
       }
    }

    public class BandStopFilter extends Kernel{
       FFT fft = new FFT();
       float[] real;
       float[] imaginary;

       BandStopFilter (float[] _real){
          real = _real;
          imaginary = new float[_real.length];
       }

       @Override public void run() {
          fft.forward(real, imaginary);
       }
    }

    public static void main(String[] args) {
       float[] data = new float[1024];
       BandStopFilter kernel = new BandStopFilter (data);
       kernel.execute(data.length);
    }

After discussion I think we are converging on a less complex solution. This is based on Witold's feedback suggestion (see below) where we use OpenCL annotations rather than forcing the implementation of the interface and the `getOpenCL()` method as originally suggested.

So we will create an `@OpenCL` annotation for classes/methods.

The `@OpenCL` annotation on a method will contain the OpenCL source replacement for that specific method. The arg list will be created by Aparapi.

The @OpenCL annotation on a class allows us to optionally introduce common code (helper methods, #pragmas, constants) which will precede the method declarations in the OpenCL code.

So an FFT example, whereby forward() and reverse() methods both call a common foo() method, might look like this.
    @OpenCL(common="/* common void foo(){} + maybe #pragmas + accessible global fields declared here */")
    public class FFT implements AparapiExtensionPoint {
       @OpenCL(signature="//function signature - OPTIONAL", body="{ /* uses foo(); */ }")
       public void forward(
          @Global @ReadWrite float[] _data,
          @Global @ReadWrite float[] _imaginary) {
          // java implementation
       }
       @OpenCL(body="{ /* uses foo(); */ }")
       public void reverse(
          @Global @ReadWrite float[] _data,
          @Global @ReadWrite float[] _imaginary) {
          // java implementation
       }
    }

To invoke from an Aparapi kernel, we should be able to do something like

    public class BandStopFilter extends Kernel{
       FFT fft = new FFT();
       float[] real;
       float[] imaginary;

       BandStopFilter (float[] _real){
          real = _real;
          imaginary = new float[_real.length];
       }

       @Override public void run() {
          fft.forward(this, real, imaginary);
       }
    }

    public static void main(String[] args) {
       float[] data = new float[1024];
       BandStopFilter kernel = new BandStopFilter (data);
       kernel.execute(data.length);
    }

Ideally we would also like to invoke FFT directly (instead of via a Kernel). This is tricky because the forward() and reverse() methods will need to be invoked across a range, and of course the dispatch across the range needs to be initiated from Aparapi.

The only way I can see how to do this is to force the creation of an interface so we can use Java's existing Proxy mechanism to create a wrapper.

    @OpenCL(wraps=FFT.class)
    interface FFTInterface{
       public void forward( Range _range, float[] _data, float[] _imaginary);
       public void reverse( Range _range, float[] _data, float[] _imaginary);
    }

Then provide a mechanism for extracting a proxy and invoking it.

    float[] real = //??
    float[] imag = //??
    Aparapi.wrap(FFTInterface.class).forward(range, real, imag);

I can't see a cleaner solution.
diff --git a/source/documentation/AparapiPatterns.html.md b/source/documentation/AparapiPatterns.html.md
deleted file mode 100644
index 8147db8f3384e24da12c2bf4ad31bf84d4e906d3..0000000000000000000000000000000000000000
--- a/source/documentation/AparapiPatterns.html.md
+++ /dev/null
@@ -1,132 +0,0 @@
---
title: AparapiPatterns
---

*Examples and code fragments to demonstrate Aparapi features. Updated Jul 24, 2012 by frost.g...@gmail.com*

##Aparapi Patterns

The following suggestions help solve some common problems found in using Aparapi.

Additional suggestions and solutions to extend this list would be welcome.

##How do I return data from a kernel if I can't write to kernel fields?

Use a small array buffer (possibly containing a single element) and assign it from the kernel.

For example, the following kernel code detects whether buffer[] contains the value 1234. The flag (true or false) is returned in found[0].

    final int buffer[] = new int[HUGE];
    final boolean found[] = new boolean[]{false};
    // fill buffer somehow
    Kernel kernel = new Kernel(){
       @Override public void run(){
          if (buffer[getGlobalId()]==1234){
             found[0]=true;
          }
       }
    };
    kernel.execute(buffer.length);

This code does include a race condition, whereby buffer[] might contain 1234 at more than one index, causing multiple kernel instances to try to set `found[0]`. This is not a problem here, because we don't care if multiple kernel executions match, provided one flips the value of `found[0]`.

##How can I use Aparapi and still maintain an object-oriented view of my data?

See the NewFeatures page.
Aparapi can now handle simple arrays of objects, which minimizes the amount of refactoring required to experiment with Aparapi. However, performance is still likely to be better if your algorithm operates on data held in parallel primitive arrays. To get higher performance from Aparapi with minimal exposure to data in this parallel primitive array form, we can (with a little work) allow both forms of data to co-exist. Let's reconsider the NBody problem (http://en.wikipedia.org/wiki/N-body_problem).

A Java developer writing an NBody solution would most likely create a Body class:

    class Body{
       float x,y,z;
       float getX(){return x;}
       void setX(float _x){ x = _x;}
       float getY(){return y;}
       void setY(float _y){ y = _y;}
       float getZ(){return z;}
       void setZ(float _z){ z = _z;}

       // other data related to Body unused by positioning calculations
    }

The developer would also likely create a container class (such as NBodyUniverse) that manages the positions of multiple Body instances.

    class NBodyUniverse{
       final Body[] bodies;
       NBodyUniverse(final Body[] _bodies){
          bodies = _bodies;
          for (int i=0; i<bodies.length; i++){
             bodies[i].setX((float)(Math.random()*100));
             bodies[i].setY((float)(Math.random()*100));
             bodies[i].setZ((float)(Math.random()*100));
          }
       }
       void adjustPositions(){
          // can use new array-of-objects Aparapi features, but is not performant
       }
    }

    Body[] bodies = new Body[BODIES];
    for (int i=0; i<bodies.length; i++){
       bodies[i] = new Body();
    }
    NBodyUniverse universe = new NBodyUniverse(bodies);
    while (true){
       universe.adjustPositions();
       // display NBodyUniverse
    }

The NBodyUniverse.adjustPositions() method contains the nested loops (adjusting each body position based on forces impinging on it from all of the other bodies), making it an ideal Aparapi candidate.

Even though this code can now be written by accessing the x, y and z ordinates of Body[] via getters/setters, the most performant Aparapi implementation is the one that operates on parallel arrays of floats containing x, y and z ordinates, with Body[10]'s state conceptually stored across x[10], y[10] and z[10].

So for performance reasons, you can do something like this:

    class Body{
       int idx;
       NBodyUniverse universe;
       void setUniverseAndIndex(NBodyUniverse _universe, int _idx){
          universe = _universe;
          idx = _idx;
       }

       // other fields not used by the position calculations

       void setX(float _x){ universe.x[idx]=_x;}
       void setY(float _y){ universe.y[idx]=_y;}
       void setZ(float _z){ universe.z[idx]=_z;}
       float getX(){ return universe.x[idx];}
       float getY(){ return universe.y[idx];}
       float getZ(){ return universe.z[idx];}
    }

    class NBodyUniverse {
       final Body[] bodies;
       final float[] x, y, z;
       NBodyUniverse(Body[] _bodies){
          bodies = _bodies;
          x = new float[bodies.length];
          y = new float[bodies.length];
          z = new float[bodies.length];
          for (int i=0; i<bodies.length; i++){
             bodies[i].setUniverseAndIndex(this, i);
             bodies[i].setX((float)(Math.random()*100));
             bodies[i].setY((float)(Math.random()*100));
             bodies[i].setZ((float)(Math.random()*100));
          }
       }
       void adjustPositions(){
          // can now more efficiently use Aparapi
       }
    }

    Body[] bodies = new Body[BODIES];
    for (int i=0; i<bodies.length; i++){
       bodies[i] = new Body();
    }
    NBodyUniverse universe = new NBodyUniverse(bodies);
    while (true){
       universe.adjustPositions();
       // display NBodyUniverse
    }

This example allows Java™ code to treat each Body in a traditional object-oriented fashion and also allows Aparapi kernels to act on the parallel primitive array form, in order to access/mutate the position of the bodies.
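For illustration, the parallel-array form of adjustPositions() might host a kernel like the following (a sketch only: the real NBody force integration is replaced by a trivial pull toward the centroid, purely to show the data layout in use):

    // Sketch: a kernel over the parallel x/y/z arrays (not the real physics).
    final float[] x = universe.x, y = universe.y, z = universe.z;
    Kernel kernel = new Kernel(){
       @Override public void run(){
          int i = getGlobalId();
          float cx = 0, cy = 0, cz = 0;
          for (int j = 0; j < x.length; j++){ // stand-in for force accumulation
             cx += x[j]; cy += y[j]; cz += z[j];
          }
          x[i] += (cx / x.length - x[i]) * 0.01f;
          y[i] += (cy / y.length - y[i]) * 0.01f;
          z[i] += (cz / z.length - z[i]) * 0.01f;
       }
    };
    kernel.execute(x.length);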
[Attribution](Attribution.md)
\ No newline at end of file
diff --git a/source/documentation/BuildingNBody.html.md b/source/documentation/BuildingNBody.html.md
deleted file mode 100644
index a669b999861d24d3c984a64b3a7d353508855f40..0000000000000000000000000000000000000000
--- a/source/documentation/BuildingNBody.html.md
+++ /dev/null
@@ -1,43 +0,0 @@
---
title: BuildingNBody
---

*How to build the NBody example. Updated Nov 11, 2011 by frost.g...@gmail.com*
##Building NBody
The NBody example is located in the examples/ subdirectory under the Aparapi trunk:

    trunk/
       ...
       examples/
          ...
          nbody/
             src/java/com.aparapi.nbody/
             build.xml
             nbody.sh
             nbody.bat

The NBody example requires a little more work to build because it depends on a third-party project named 'JOGL'.

JOGL is a set of OpenGL™ bindings for Java® and the NBody example uses this library to render the particles/bodies (potentially many thousands of them) at runtime. More information about JOGL can be found at http://jogamp.org/jogl/www.

The build target in build.xml will download the jars required to build and run the nbody example if the files do not exist.

To build nbody, perform the following command.

    C:> ant clean build

The NBody build.xml file includes a 'run' target so you can launch the application using

    C:> ant run

or, if you prefer, launch from either the nbody.sh or nbody.bat script.

For Linux® we also need to chmod nbody.sh in order to execute it.

    chmod +x nbody.sh

The nbody scripts take the execution mode as the first argument, the number of bodies as the second argument, and then the height and width (in pixels).

Windows example:

    C:> nbody GPU 32768 800 800

Linux example:

    $ ./nbody.sh GPU 32768 800 800

Attribution
\ No newline at end of file
diff --git a/source/documentation/ChoosingSpecificDevicesForExecution.html.md b/source/documentation/ChoosingSpecificDevicesForExecution.html.md
deleted file mode 100644
index 11d71fddb99e3ec364f7a8c23c03327bcd1b905d..0000000000000000000000000000000000000000
--- a/source/documentation/ChoosingSpecificDevicesForExecution.html.md
+++ /dev/null
@@ -1,61 +0,0 @@
---
title: ChoosingSpecificDevicesForExecution
---

*Using the new Device APIs to choose Kernel execution on a specific device. Updated Sep 18, 2012 by frost.g...@gmail.com*

Previously Aparapi chose the first GPU device when Kernel.execute() was called. This made it easy to execute simple Kernels, but was problematic when users wished for finer control over which device should be chosen, especially when the first device might be unsuitable. We recently added new classes and APIs to allow the developer to specify exactly which device we intend to target.

A new Device class has been added. This allows the user to select a specific device, either by calling a helper method such as Device.firstGPU() or Device.best(), or by iterating through all devices and choosing one based on some other criteria (capabilities? vendor name?).

So selecting the 'best' (most performant) device could be achieved using

    Device device = Device.best();

Alternatively, if I wanted the first AMD GPU device I might use:

    Device chosen=null;
    for (Device device: devices.getAll()){
       if (device.getVendor().contains("AMD") && device.isGPU()){
          chosen = device;
          break;
       }
    }

A Device can be queried `(isGPU(), isOpenCL(), isGroup(), isJava(), getOpenCLPlatform(), getMaxMemory(), getLocalSizes())` to yield its characteristics.
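For example, a quick survey of the available devices (a sketch reusing the iteration idiom and the query methods listed above) might look like:

    // Sketch: dump each device's characteristics before choosing one.
    for (Device device : devices.getAll()){
       System.out.println(device.getVendor()
          + " isGPU=" + device.isGPU()
          + " maxMemory=" + device.getMaxMemory());
    }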
To execute on a specific device we must use the device to create our range.

    Range range = device.createRange2D(width, height);

This allows the Range to be created with knowledge of the underlying device. So, for example, device.createRange3D(1024, 1024, 1024, 16, 16, 16) will fail if the device does not allow a local size of (16x16x16).

A range created using a device method captures the device which created it: the range instance has a device field which is set by the device that creates it.

It's as if we had this code

    Range range = Range.create(width, height);
    range.setDevice(device);

So the Range locks in the device that it can be used with.

Now when we have a Kernel

    Kernel kernel = new Kernel(){
       @Override public void run(){
          ...
       }
    }

and we then use a device-created range

    Device device = Device.firstGPU();
    Kernel kernel = new Kernel(){
       @Override public void run(){
          // uses input[];
       }
    };
    range = device.createRange2D(1024, 1024);
    kernel.execute(range);

we have forced execution on the first GPU.
diff --git a/source/documentation/ConvertingBytecodeToOpenCL.html.md b/source/documentation/ConvertingBytecodeToOpenCL.html.md
deleted file mode 100644
index 8ee63904c7e2edef648fc69af84377a8681d68ef..0000000000000000000000000000000000000000
--- a/source/documentation/ConvertingBytecodeToOpenCL.html.md
+++ /dev/null
@@ -1,285 +0,0 @@
---
title: ConvertingBytecodeToOpenCL
---

*How Aparapi converts bytecode to OpenCL. Updated Aug 23, 2012 by frost.g...@gmail.com*

##Introduction

[try this](ByteCode2OpenCL.pdf)

One of the unique Aparapi features is its ability to convert Java bytecode to OpenCL automatically.

In this page we will try to describe the process used to perform this conversion. If you are unfamiliar with bytecode, consider visiting the WhatIsBytecode page.

The command

    javac Source.java

will compile the Java source file Source.java to Source.class.

The classfile format is well documented, so we will not go into too much detail here; however, note that Aparapi must parse the classfile of each Kernel to extract the bytecode for Kernel.run() and any method reachable from Kernel.run().

Let's start with a simple Kernel.

    import com.aparapi.Kernel;

    public class Squarer extends Kernel{
       int[] in;
       int[] out;
       @Override public void run(){
          int gid = getGlobalId(0);
          out[gid] = in[gid] * in[gid];
       }
    }

We will compile this

    javac -g -cp path/to/aparapi/aparapi.jar Squarer.java

and then we can look at the bytecode using javap

    javap -c -classpath path/to/aparapi/aparapi.jar;. Squarer
Compiled from "Squarer.java"

    public class Squarer extends com.aparapi.Kernel
      SourceFile: "Squarer.java"
      minor version: 0
      major version: 50
      Constant pool:
    const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V
    const #2 = Method #5.#18; // Squarer.getGlobalId:(I)I
    const #3 = Field #5.#19; // Squarer.out:[I
    const #4 = Field #5.#20; // Squarer.in:[I
    const #5 = class #21; // Squarer
    const #6 = class #22; // com/amd/aparapi/Kernel
    const #7 = Asciz in;
    const #8 = Asciz [I;
    const #9 = Asciz out;
    const #10 = Asciz <init>;
    const #11 = Asciz ()V;
    const #12 = Asciz Code;
    const #13 = Asciz LineNumberTable;
    const #14 = Asciz run;
    const #15 = Asciz SourceFile;
    const #16 = Asciz Squarer.java;
    const #17 = NameAndType #10:#11;// "<init>":()V
    const #18 = NameAndType #23:#24;// getGlobalId:(I)I
    const #19 = NameAndType #9:#8;// out:[I
    const #20 = NameAndType #7:#8;// in:[I
    const #21 = Asciz Squarer;
    const #22 = Asciz com/amd/aparapi/Kernel;
    const #23 = Asciz getGlobalId;
    const #24 = Asciz (I)I;

    {
    int[] in;

    int[] out;

    public Squarer();
      Code:
       Stack=1, Locals=1, Args_size=1
       0: aload_0
       1: invokespecial #1; //Method com/amd/aparapi/Kernel."<init>":()V
       4: return

    public void run();
      Code:
       Stack=5, Locals=2, Args_size=1
       0: aload_0
       1: iconst_0
       2: invokevirtual #2; //Method getGlobalId:(I)I
       5: istore_1
       6: aload_0
       7: getfield #3; //Field out:[I
       10: iload_1
       11: aload_0
       12: getfield #4; //Field in:[I
       15: iload_1
       16: iaload
       17: aload_0
       18: getfield #4; //Field in:[I
       21: iload_1
       22: iaload
       23: imul
       24: iastore
       25: return
    }

Here we see the constant pool of the class and the disassembled bytecode of the default constructor Squarer() and the Squarer.run() method.

The constant pool is a table of constant values that can be accessed from the bytecode of any method within this class. Some of the constants are String literals defined within the source (or literals used to name classes, fields, methods, variables or signatures); other slots represent Classes, Methods, Fields or Type signatures. These latter constant pool entries cross-reference other constant pool entries to describe higher-level artifacts.

For example constant pool entry #1 is

    const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V

So entry #1 defines a method. The class containing the method is defined in constant pool entry #6. So let's look at constant pool entry #6.

    const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V

    const #6 = class #22; // com/amd/aparapi/Kernel

At constant pool entry #6 we find a class definition which refers to entry #22

    const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V

    const #6 = class #22; // com/amd/aparapi/Kernel

    const #22 = Asciz com/amd/aparapi/Kernel;

which just contains the ASCII name of the class.

Looking back at entry #1 again, we note that the Method also references entry #17, which contains a NameAndType entry for determining the method name and the signature.

    const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V

    const #6 = class #22; // com/amd/aparapi/Kernel

    const #17 = NameAndType #10:#11;// "<init>":()V

    const #22 = Asciz com/amd/aparapi/Kernel;

Entry #17's "NameAndType" references #10 for the method name.
    const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V

    const #6 = class #22; // com/amd/aparapi/Kernel

    const #10 = Asciz <init>;

    const #17 = NameAndType #10:#11;// "<init>":()V

    const #22 = Asciz com/amd/aparapi/Kernel;

And then references #11 to get the signature.

    const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V

    const #6 = class #22; // com/amd/aparapi/Kernel

    const #10 = Asciz <init>;

    const #11 = Asciz ()V;

    const #17 = NameAndType #10:#11;// "<init>":()V

    const #22 = Asciz com/amd/aparapi/Kernel;

So from constant pool entry #1 we ended up using slots 1, 6, 10, 11, 17 and 22 to fully resolve the method.

This looks like a lot of work; however, breaking method and field references up like this allows the various slots to be reused by other field/method descriptions.

So when we see disassembled bytecode which references a constant pool slot, the actual slot # (2 in the example below) appears after the opcode, here invokevirtual.

    2: invokevirtual #2; Method getGlobalId:(I)I

Bytecode is basically able to access three things

1. Constant pool entries
2. Variable slots
3. Stack operands

Instructions are able to pop operands from the stack, push operands to the stack, load values from variable slots (to the stack), store values (from the stack) to variable slots, push values from accessed fields (to the stack) and call methods (popping args from the stack).

Some instructions can only handle specific types (int, float, double, and object instances - arrays are special forms of objects) and usually the first character of the instruction helps determine which type the instruction acts upon. So imul would be a multiply instruction that operates on integers, fmul would multiply two floats, dmul two doubles. Instructions that begin with 'a' operate on object instances.

So let's look at the first instruction.

    0: aload_0

This instruction loads an object (a is the first character) from variable slot 0 (we'll come back to the variable slots in a moment) and pushes it on the stack.

Variables are held in 'slots' that are reserved at compile time.

Consider this static method.

    static int squareMe(int value){
       value *= value;
       return(value);
    }

This method requires one variable slot. At any one time there is only one variable that is live; it just happens to be an argument to the method.

The following method also requires only one slot.

    static int squareMe(){
       int value=4;
       value *= value;
       return(value);
    }

Here we need two slots

    static int squareMe(int arg){
       int value=arg*arg;
       return(value);
    }

Surprisingly, the following also only requires two slots.

    static int squareMe(int arg){
       {
          int temp = arg*arg;
       }
       int value=arg*arg;
       return(value);
    }

Note that in the above example the temp variable loses scope before the local variable value is used. So only two slots are required; temp and value can share a slot.

If we have an instance method we always require one extra slot (always slot 0) for the this reference.

So

    int squareMe(int arg){
       int value=arg*arg;
       return(value);
    }

requires three slots.

Anyway, back to our bytecode

    0: aload_0

This loads the object instance in slot 0 (this) and pushes it on the stack.

Next we have

    1: iconst_0

which pushes the int constant 0 on the stack. So the stack contains {this, 0}.

Next we have

    2: invokevirtual #2; //Method getGlobalId:(I)I

This is the bytecode for calling a method.
Basically the instruction itself references the constant pool (we'll come back to this ;) ) and pulls the method description from constant pool entry #2, which happens to be the description for a method called `getGlobalId()` which takes an int and returns an `int`.

So the VM will pop the top value (the int const 0) as the method arg, then pop an object reference (this!), call the method `this.getGlobalId(0)` and push the result (an int) back on the stack.

So our stack, which contained `{this, 0}`, now contains the result of this.getGlobalId(0); let's assume it is {0}. We describe this invoke instruction as consuming two operands from the stack and producing one.

Before we start executing, our stack is empty {} and the slots are initialized with 'this' (if an instance method) and any arguments passed to the method.

                                                 slots=[this, ?]  stack={}

    0: aload_0                                   slots=[this, ?]  stack={this}

    1: iconst_0                                  slots=[this, ?]  stack={this, 0}

    2: invokevirtual #2; Method getGlobalId:(I)I
                                                 slots=[this, ?]  stack={result of this.getGlobalId(0), say 0}

    5: istore_1                                  slots=[this, 0]  stack={}

    6: aload_0                                   slots=[this, 0]  stack={this}

    7: getfield #3; //Field out:[I
diff --git a/source/documentation/EmulatingMultipleEntrypointsUsingCurrentAPI.html.md b/source/documentation/EmulatingMultipleEntrypointsUsingCurrentAPI.html.md
deleted file mode 100644
index f8fa157574922b027427055c1bd70fda4bb68beb..0000000000000000000000000000000000000000
--- a/source/documentation/EmulatingMultipleEntrypointsUsingCurrentAPI.html.md
+++ /dev/null
@@ -1,229 +0,0 @@
---
title: EmulatingMultipleEntrypointsUsingCurrentAPI
---

*How to emulate multiple entrypoints using existing Aparapi APIs. Updated Jul 30, 2012 by frost.g...@gmail.com*

##Emulating Multiple Entrypoints Using Existing Aparapi APIs

Until we have support for multiple entrypoints in Aparapi, there are some tricks for emulating this feature.

Follow the proposal for adding multiple entrypoints on this page: [MultipleEntryPointSupportProposal](MultipleEntryPointSupportProposal.md).

Suppose we wanted to create a general VectorMath kernel which might expose unary square and squareroot methods and binary addition and subtraction functionality. With our current API limitations we can't easily do this, but we can approximate separate methods by passing an extra arg to dictate the 'function' that we wish to perform.

    class VectorKernel extends Kernel{
       float[] lhsOperand;
       float[] rhsOperand;
       float[] unaryOperand;
       float[] result;
       final static int FUNC_ADD =0;
       final static int FUNC_SUB =1;
       final static int FUNC_SQR =2;
       final static int FUNC_SQRT =3;
       // other functions
       int function;
       @Override public void run(){
          int gid = getGlobalId(0);
          if (function==FUNC_ADD){
             result[gid]=lhsOperand[gid]+rhsOperand[gid];
          }else if (function==FUNC_SUB){
             result[gid]=lhsOperand[gid]-rhsOperand[gid];
          }else if (function==FUNC_SQR){
             result[gid]=unaryOperand[gid]*unaryOperand[gid];
          }else if (function==FUNC_SQRT){
             result[gid]=sqrt(unaryOperand[gid]);
          }else if ....
       }
    }

To use this for adding two vectors and then taking the sqrt of the result, we would use something like:
    int SIZE=1024;
    Range range = Range.create(SIZE);
    VectorKernel vk = new VectorKernel();
    vk.lhsOperand = new float[SIZE];
    vk.rhsOperand = new float[SIZE];
    vk.unaryOperand = new float[SIZE];
    vk.result = new float[SIZE];

    // fill lhsOperand omitted
    // fill rhsOperand omitted
    vk.function = VectorKernel.FUNC_ADD;
    vk.execute(range);
    System.arraycopy(vk.result, 0, vk.unaryOperand, 0, SIZE);
    vk.function = VectorKernel.FUNC_SQRT;
    vk.execute(range);

This approach is fairly common and I have used it successfully to perform various pipeline stages for calculating FFTs, for example. Whilst this is functional, it is not a great solution. First, the API is clumsy: we have to mutate the state of the kernel instance and then re-arrange the arrays manually to chain math operations. We could of course hide all of this behind helper methods. One could imagine, for example, an implementation which exposes helpers add(lhs, rhs) or sqrt() which hide all the nasty stuff.

    class VectorKernel extends Kernel{
       float[] lhsOperand;
       float[] rhsOperand;
       float[] unaryOperand;
       float[] result;
       final static int FUNC_ADD =0;
       final static int FUNC_SUB =1;
       final static int FUNC_SQR =2;
       final static int FUNC_SQRT =3;
       // other functions
       int function;
       @Override public void run(){
          int gid = getGlobalId(0);
          if (function==FUNC_ADD){
             result[gid]=lhsOperand[gid]+rhsOperand[gid];
          }else if (function==FUNC_SUB){
             result[gid]=lhsOperand[gid]-rhsOperand[gid];
          }else if (function==FUNC_SQR){
             result[gid]=unaryOperand[gid]*unaryOperand[gid];
          }else if (function==FUNC_SQRT){
             result[gid]=sqrt(unaryOperand[gid]);
          }else if ....
       }
       private void binary(int operator, float[] lhs, float[] rhs){
          lhsOperand = lhs;
          rhsOperand = rhs;
          function=operator;
          execute(lhs.length);
       }
       public void add(float[] lhs, float[] rhs){
          binary(FUNC_ADD, lhs, rhs);
       }

       public void sub(float[] lhs, float[] rhs){
          binary(FUNC_SUB, lhs, rhs);
       }

       private void binary(int operator, float[] rhs){
          System.arraycopy(result, 0, lhsOperand, 0, result.length);
          rhsOperand = rhs;
          function=operator;
          execute(lhsOperand.length);
       }

       public void add(float[] rhs){
          binary(FUNC_ADD, rhs);
       }

       public void sub( float[] rhs){
          binary(FUNC_SUB, rhs);
       }

       private void unary(int operator, float[] unary){
          unaryOperand = unary;
          function=operator;
          execute(unaryOperand.length);
       }

       public void sqrt(float[] unary){
          unary(FUNC_SQRT, unary);
       }

       private void unary(int operator){
          System.arraycopy(result, 0, unaryOperand, 0, result.length);
          function=operator;
          execute(unaryOperand.length);
       }

       public void sqrt(){
          unary(FUNC_SQRT);
       }
    }

    VectorKernel vk = new VectorKernel(SIZE);
    vk.add(copyLhs, copyRhs); // copies args to lhs and rhs operands
                              // sets function type
                              // and executes kernel
    vk.sqrt();                // because we have no arg
                              // copies result to unary operand
                              // sets function type
                              // executes kernel

However there is one more objection to this approach, namely that by default it will force unnecessary buffer copies.

When the bytecode for the above Kernel.run() method is analyzed, Aparapi finds bytecode reading from the lhsOperand, rhsOperand and unaryOperand arrays/buffers. Obviously at this bytecode analysis stage we can't predict which 'function type' will be used, so on every execution (Kernel.run()) Aparapi must copy all three buffers to the GPU.
For binary operations this is one wasted buffer copy (the unaryOperand); for the unary operations we copy two buffers unnecessarily (lhsOperand and rhsOperand). We can of course use explicit buffer management to help us reduce these costs. Ideally we add this to our helper methods.

    class VectorKernel extends Kernel{
       float[] lhsOperand;
       float[] rhsOperand;
       float[] unaryOperand;
       float[] result;
       final static int FUNC_ADD =0;
       final static int FUNC_SUB =1;
       final static int FUNC_SQR =2;
       final static int FUNC_SQRT =3;
       // other functions
       int function;
       @Override public void run(){
          int gid = getGlobalId(0);
          if (function==FUNC_ADD){
             result[gid]=lhsOperand[gid]+rhsOperand[gid];
          }else if (function==FUNC_SUB){
             result[gid]=lhsOperand[gid]-rhsOperand[gid];
          }else if (function==FUNC_SQR){
             result[gid]=unaryOperand[gid]*unaryOperand[gid];
          }else if (function==FUNC_SQRT){
             result[gid]=sqrt(unaryOperand[gid]);
          }else if ....
       }
       private void binary(int operator, float[] lhs, float[] rhs){
          lhsOperand = lhs;
          rhsOperand = rhs;
          function=operator;
          put(lhsOperand).put(rhsOperand);
          execute(lhs.length);
          get(result);
       }
       public void add(float[] lhs, float[] rhs){
          binary(FUNC_ADD, lhs, rhs);
       }

       public void sub(float[] lhs, float[] rhs){
          binary(FUNC_SUB, lhs, rhs);
       }

       private void binary(int operator, float[] rhs){
          System.arraycopy(result, 0, lhsOperand, 0, result.length);
          rhsOperand = rhs;
          function=operator;
          put(lhsOperand).put(rhsOperand);
          execute(lhsOperand.length);
          get(result);
       }

       public void add(float[] rhs){
          binary(FUNC_ADD, rhs);
       }

       public void sub( float[] rhs){
          binary(FUNC_SUB, rhs);
       }

       private void unary(int operator, float[] unary){
          unaryOperand = unary;
          function=operator;
          put(unaryOperand);
          execute(unaryOperand.length);
          get(result);
       }

       public void sqrt(float[] unary){
          unary(FUNC_SQRT, unary);
       }

       private void unary(int operator){
          System.arraycopy(result, 0, unaryOperand, 0, result.length);
          function=operator;
          put(unaryOperand);
          execute(unaryOperand.length);
          get(result);
       }

       public void sqrt(){
          unary(FUNC_SQRT);
       }
    }

diff --git a/source/documentation/HSAEnablementOfLambdaBranchSidebar.html.md b/source/documentation/HSAEnablementOfLambdaBranchSidebar.html.md
deleted file mode 100644
index b23a0c5d64a18edaa42dfd4d9c681f34071ff144..0000000000000000000000000000000000000000
--- a/source/documentation/HSAEnablementOfLambdaBranchSidebar.html.md
+++ /dev/null
@@ -1,9 +0,0 @@
---
title: HSAEnablementOfLambdaBranchSidebar
---

*Sidebar for HSAEnablementOfLambdaBranchAparapi*

[How to setup a HSA enabled Linux Platform](SettingUpLinuxHSAMachineForAparapi.md)

[How to setup a HSA simulator on a Linux Platform](UsingAparapiLambdaBranchWithHSASimulator.md)
diff --git a/source/documentation/MultipleEntryPointSupportProposal.html.md b/source/documentation/MultipleEntryPointSupportProposal.html.md
deleted file mode 100644
index d9668b295d7ac7cbc1024274ce1fe8de151f2b6a..0000000000000000000000000000000000000000
--- a/source/documentation/MultipleEntryPointSupportProposal.html.md
+++ /dev/null
@@ -1,380 +0,0 @@
---
title: MultipleEntryPointSupportProposal
---

*How to extend Aparapi to allow multiple entrypoints for kernels. Updated Jul 30, 2012 by frost.g...@gmail.com*

##The Current Single Entrypoint World

At present Aparapi dispatches execution to a single entry point in a Kernel.
Essentially, for each Kernel only the overridden Kernel.run() method can be used to initiate execution on the GPU.

Our canonical example is the 'Squarer' Kernel, which writes the square of each element of an input array to an output array.

    Kernel squarer = new Kernel(){
       @Override public void run(){
          int id = getGlobalId(0);
          out[id] = in[id] * in[id];
       }
    };

If we wanted a vector addition Kernel we would have to create a whole new Kernel.

    Kernel adder = new Kernel(){
       @Override public void run(){
          int id = getGlobalId(0);
          out[id] = lhs[id] + rhs[id];
       }
    };

For us to square and then add a constant we would have to invoke two kernels, or of course create a single SquarerAdder kernel.

See the EmulatingMultipleEntrypointsUsingCurrentAPI page for ideas on how to emulate having multiple methods by passing data to a single run() method.

##Why can't Aparapi just allow 'arbitrary' methods

Ideally we would just expose a more natural API, one which allows us to provide specific methods for each arithmetic operation.

Essentially

    class VectorKernel extends Kernel{
       public void add();
       public void sub();
       public void sqr();
       public void sqrt();
    }

Unfortunately this is hard to implement using Aparapi. There are two distinct problems, both at runtime.

1. How will Aparapi know which of the available methods we want to execute when we call Kernel.execute(range)?
1. On first execution, how does Aparapi determine which methods might be entrypoints and therefore need to be converted to OpenCL?

The first problem can be solved by extending Kernel.execute() to accept a method name

    kernel.execute(SIZE, "add");

This is the obvious solution, but it causes maintenance issues in that it trades compile-time reporting for runtime errors. If a developer mistypes the name of the method:

    kernel.execute(SIZE, "sadd"); // there is no such method

the code will compile perfectly; only at runtime will we detect that there is no such method.

##An aside

Maybe the new Java 8 method reference feature might help here. In the paper below Brian Goetz talks about a double-colon syntax (Class::Method) for directly referencing a method, which is presumably checked at compile time.

So presumably

    kernel.execute(SIZE, VectorKernel::add);

would compile just fine, whereas

    kernel.execute(SIZE, VectorKernel::sadd);

would yield a compile-time error.

See Brian Goetz's excellent Lambda documentation.

##Back from the aside

The second problem (knowing which methods need to be converted to OpenCL) can probably be solved using an annotation.

    class VectorKernel extends Kernel{
       @EntryPoint public void add();
       @EntryPoint public void sub();
       @EntryPoint public void sqr();
       @EntryPoint public void sqrt();
       public void nonOpenCLMethod();
    }

Here the @EntryPoint annotation allows the Aparapi runtime to determine which methods need to be exposed.

##My Extension Proposal

Here is my proposal. Not only does it allow us to reference multiple entrypoints, but I think it actually improves the single entrypoint API, albeit at the cost of being more verbose.

##The developer must provide an API interface

First, I propose that we should ask the developer to provide an interface for all methods that we wish to execute on the GPU (or convert to OpenCL).
    interface VectorAPI extends AparapiAPI {
       public void add(Range range);
       public void sub(Range range);
       public void sqrt(Range range);
       public void sqr(Range range);
    }

Note that each method takes a Range; this will make more sense in a moment.

##The developer provides a bound implementation

Aparapi should provide a mechanism for mapping the proposed API to its implementation.

Note the weasel words here: this is not a conventional implementation of an interface. We will use an annotation (@Implements(Class class)) to provide the binding.

    @Implements(VectorAPI.class) class Vector extends Kernel {
       public void add(RangeId rangeId){/*implementation here */}
       public void sub(RangeId rangeId){/*implementation here */}
       public void sqrt(RangeId rangeId){/*implementation here */}
       public void sqr(RangeId rangeId){/*implementation here */}
       public void nonOpenCLMethod(){/* not converted to OpenCL */}
    }

##Why can't the implementation just implement the interface?

This would be ideal. Sadly we need to intercept a call to, say, VectorAPI.add(Range) and dispatch to the resulting Vector.add(RangeId) instances. If you look at the signatures, the interface accepts a Range as its arg (the range over which we intend to execute) whereas the implementation (whether called by JTP threads or GPU OpenCL dispatch) receives a RangeId (containing the unique globalId, localId, etc. fields). At the very end of this page I show a strawman sequential-loop implementation.

##So how do we get an implementation of VectorAPI

We instantiate our Kernel by creating an instance using new. We then ask this instance to create an API instance. Some (presumably java.lang.reflect.Proxy) trickery will create an implementation of the interface, backed by the Java implementation.

So execution would look something like

    Vector kernel = new Vector();
    VectorAPI kernelApi = kernel.api();
    Range range = Range.create(SIZE);
    kernelApi.add(range);

So the Vector instance is a pure Java implementation. The extracted API is the bridge to the GPU.

Of course we can also execute using an inline call through api()

    Vector kernel = new Vector();
    Range range = Range.create(SIZE);
    kernel.api().add(range);
    kernel.api().sqrt(range);

or even expose api as a public final field

    Vector kernel = new Vector();
    Range range = Range.create(SIZE);
    kernel.api.add(range);
    kernel.api.sqrt(range);

##How would our canonical Squarer example look

    interface SquarerAPI extends AparapiAPI{
       void square(Range range);
    }

    @Implements(SquarerAPI.class) class Squarer extends Kernel{
       int in[];
       int square[];
       public void square(RangeId rangeId){
          square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid];
       }
    }

Then we execute using

    Squarer squarer = new Squarer();
    // fill squarer.in[SIZE]
    // create squarer.square[SIZE];

    squarer.api().square(Range.create(SIZE));

##Extending this proposal to allow argument passing

Note that we have effectively replaced the use of the 'abstract' squarer.execute(range) with the more concrete squarer.api().square(range).

Now I would like to propose that we take one more step, by allowing us to pass arguments to our methods.

Normally Aparapi captures buffer and field accesses to create the args that it passes to the generated OpenCL code. In our canonical squarer example the in[] and square[] buffers are captured from the bytecode and passed (behind the scenes) to the OpenCL.
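For a flavor of what that capture looks like on the OpenCL side, here is a hand-written approximation of the squarer's kernel (illustrative only, per the TODO below: this is not actual Aparapi codegen output, which also carries a struct of captured fields):

    __kernel void run(
       __global int *in,       // captured from the Kernel's in[] field
       __global int *square){  // captured from the Kernel's square[] field
       int gid = get_global_id(0);
       square[gid] = in[gid] * in[gid];
    }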
- -* **TODO: Add generated OpenCl code to show what this looks like.** * - -However, by exposing the actual method we want to execute, we could also allow the API to accept parameters. - -So our squarer example would go from - - interface SquarerAPI extends AparapiAPI{ - square(Range range); - } - - @Implement(SquarerAPI) class Squarer extends Kernel{ - int in[]; - int square[]; - public void square(RangeId rangeId){ - square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid]; - } - } - - - Squarer squarer = new Squarer(); - // fill squarer.in[SIZE] - // create squarer.values[SIZE]; - - squarer.api().square(Range.create(SIZE)); - -to - - interface SquarerAPI extends AparapiAPI{ - square(Range range, int[] in, int[] square); - } - - @Implement(SquarerAPI) class Squarer extends Kernel{ - public void square(RangeId rangeId, int[] in, int[] square){ - square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid]; - } - } - - - Squarer squarer = new Squarer(); - int[] in = // create and fill squarer.in[SIZE] - int[] square = // create squarer.values[SIZE]; - - squarer.api().square(Range.create(SIZE), in, result); - -I think that this makes Aparapi look more conventional. It also allows us to allow overloading for the first time. - - interface SquarerAPI extends AparapiAPI{ - square(Range range, int[] in, int[] square); - square(Range range, float[] in, float[] square); - } - - @Implement(SquarerAPI) class Squarer extends Kernel{ - public void square(RangeId rangeId, int[] in, int[] square){ - square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid]; - } - public void square(RangeId rangeId, float[] in, float[] square){ - square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid]; - } - } - - - Squarer squarer = new Squarer(); - int[] in = // create and fill squarer.in[SIZE] - int[] square = // create squarer.values[SIZE]; - - squarer.api().square(Range.create(SIZE), in, result); - float[] inf = // create and fill squarer.in[SIZE] - float[] squaref = // create squarer.values[SIZE]; - - squarer.api().square(Range.create(SIZE), inf, resultf); - ---- - -test harness - - import java.lang.reflect.InvocationHandler; - import java.lang.reflect.Method; - import java.lang.reflect.Proxy; - - - public class Ideal{ - - public static class OpenCLInvocationHandler<T> implements InvocationHandler { - Object instance; - OpenCLInvocationHandler(Object _instance){ - instance = _instance; - } - @Override public Object invoke(Object interfaceThis, Method interfaceMethod, Object[] interfaceArgs) throws Throwable { - Class clazz = instance.getClass(); - - Class[] argTypes = interfaceMethod.getParameterTypes(); - argTypes[0]=RangeId.class; - Method method = clazz.getDeclaredMethod(interfaceMethod.getName(), argTypes); - - - if (method == null){ - System.out.println("can't find method"); - }else{ - RangeId rangeId = new RangeId((Range)interfaceArgs[0]); - interfaceArgs[0]=rangeId; - for (rangeId.wgid = 0; rangeId.wgid <rangeId.r.width; rangeId.wgid++){ - method.invoke(instance, interfaceArgs); - } - } - - return null; - } - } - - static class Range{ - int width; - Range(int _width) { - width = _width; - } - } - - static class Range2D extends Range{ - int height; - - Range2D(int _width, int _height) { - super(_width); - height = _height; - } - } - - static class Range1DId<T extends Range>{ - Range1DId(T _r){ - r = _r; - } - T r; - - int wgid, wlid, wgsize, wlsize, wgroup; - } - - static class RangeId extends Range1DId<Range>{ - RangeId(Range r){ - super(r); - } - } - - static class Range2DId extends Range1DId<Range2D>{ - Range2DId(Range2D 
r){ - super(r); - } - - int hgid, hlid, hgsize, hlsize, hgroup; - } - - - - - - static <T> T create(Object _instance, Class<T> _interface) { - OpenCLInvocationHandler<T> invocationHandler = new OpenCLInvocationHandler<T>(_instance); - T instance = (T) Proxy.newProxyInstance(Ideal.class.getClassLoader(), new Class[] { - _interface, - - }, invocationHandler); - return (instance); - - } - - - - public static class Squarer{ - interface API { - public API foo(Range range, int[] in, int[] out); - public Squarer dispatch(); - - } - - public API foo(RangeId rangeId, int[] in, int[] out) { - out[rangeId.wgid] = in[rangeId.wgid]*in[rangeId.wgid]; - return(null); - } - } - - /** - * @param args - */ - public static void main(String[] args) { - - Squarer.API squarer = create(new Squarer(), Squarer.API.class); - int[] in = new int[] { - 1, - 2, - 3, - 4, - 5, - 6 - }; - int[] out = new int[in.length]; - Range range = new Range(in.length); - - squarer.foo(range, in, out); - - for (int s:out){ - System.out.println(s); - } - - } - - } - diff --git a/source/documentation/NewOpenCLBinding.html.md b/source/documentation/NewOpenCLBinding.html.md deleted file mode 100644 index bb32a8855aa191980a6d283db2661515a6cfccad..0000000000000000000000000000000000000000 --- a/source/documentation/NewOpenCLBinding.html.md +++ /dev/null @@ -1,54 +0,0 @@ ---- - title: NewOpenCLBinding ---- - -*How to use new OpenCL binding mechanism. Updated Mar 6, 2012 by frost.g...@gmail.com* -As a step towards the extension mechanism I needed a way to easily bind OpenCL to an interface. - -Here is what I have come up with. We will use the 'Square' example. - -You first define an interface with OpenCL annotations.. - - interface Squarer extends OpenCL<Squarer>{ - @Kernel("{\n"// - + " const size_t id = get_global_id(0);\n"// - + " out[id] = in[id]*in[id];\n"// - + "}\n")// - public Squarer square(// - Range _range,// - @GlobalReadOnly("in") float[] in,// - @GlobalWriteOnly("out") float[] out); - } - -This describes the API we wish to bind to a set of kernel entrypoints (here we only have one, but we could have many). Then you 'realize' the interface by asking a device to create an implementation of the interface. Device is a new Aparapi class which represents a GPU or CPU OpenCL device. So here we are asking for the first (default) GPU device to realize the interface. - - Squarer squarer = Device.firstGPU(Squarer.class); -Now you can call the implementation directly with a Range. - - squarer.square(Range.create(in.length), in, out); -I think that we will have the easiest OpenCL binding out there... - -Following some conversations/suggestions online http://a-hackers-craic.blogspot.com/2012/03/aparapi.html we could also offer the ability to provide the OpenCL source from a file/url course using interface level Annotations. - -So we could allow. - - @OpenCL.Resource("squarer.cl"); - interface Squarer extends OpenCL<Squarer>{ - public Squarer square(// - Range _range,// - @GlobalReadOnly("in") float[] in,// - @GlobalWriteOnly("out") float[] out); - } -Or if the text is on-hand at compile time in a single constant string - - @OpenCL.Source("... opencl text here"); - interface Squarer extends OpenCL<Squarer>{ - public Squarer square(// - Range _range,// - @GlobalReadOnly("in") float[] in,// - @GlobalWriteOnly("out") float[] out); - } -Finally to allow for creation of dynamicl OpenCL (good for FFT's of various Radii). 
-
-    String openclSource = ...;
-    Squarer squarer = Device.firstGPU(Squarer.class, openclSource);
diff --git a/source/documentation/PossibleAparapiLambdaSyntaxOptions.html.md b/source/documentation/PossibleAparapiLambdaSyntaxOptions.html.md
deleted file mode 100644
index 8c0085b7e742e693b6501d2e5d82991dd6558a0d..0000000000000000000000000000000000000000
--- a/source/documentation/PossibleAparapiLambdaSyntaxOptions.html.md
+++ /dev/null
@@ -1,99 +0,0 @@
----
- title: PossibleAparapiLambdaSyntaxOptions
----
-
-*syntax suggestions for HSA enabled Aparapi*
-
-#Introduction
-Now that Java 8 is nearly upon us and the HSA enabled Aparapi 'lambda' branch is usable (though in no way complete) I figured we could use this page to discuss the 'programming model' we might prefer for Aparapi, and contrast it with the new Java 8 lambda-based stream APIs.
-
-##Converting between Aparapi HSA + Java 8 enabled Aparapi
-Our **hello world** app has always been the 'vector add'. In classic Aparapi we could transform
-
-    final float inA[] = .... // get a float array from somewhere
-    final float inB[] = .... // get a float array from somewhere
-    // assume (inA.length==inB.length)
-    final float result[] = new float[inA.length];
-
-    for (int i=0; i<inA.length; i++){
-       result[i]=inA[i]+inB[i];
-    }
-to
-
-    Kernel kernel = new Kernel(){
-       @Override public void run(){
-          int i= getGlobalId();
-          result[i]=inA[i]+inB[i];
-       }
-    };
-    Range range = Range.create(result.length);
-    kernel.execute(range);
-For the lambda aparapi branch we can currently use
-
-    Device.hsa().forEach(result.length, i-> result[i]=inA[i]+inB[i]);
-Note that the closest Java 8 construct is
-
-    IntStream.range(0, result.length).parallel().forEach(i-> result[i]=inA[i]+inB[i]);
-Aparapi and the Java 8 stream APIs both use IntConsumer as the lambda type, so you can reuse the lambda.
-
-    IntConsumer lambda = i-> result[i]=inA[i]+inB[i];
-
-    IntStream.range(0, result.length).parallel().forEach(lambda);
-    Device.hsa().forEach(result.length, lambda);
-Exposing the 'deviceness' of this was a conscious choice. We may also hide it completely.
-
-    IntConsumer lambda = i-> result[i]=inA[i]+inB[i];
-
-    IntStream.range(0, result.length).parallel().forEach(lambda);
-    Aparapi.forEach(result.length, lambda);
-I am toying with providing an API which maps more closely to the Stream API from Java 8.
-
-Maybe
-
-    IntStream.range(0, result.length).parallel().forEach(lambda);
-    Aparapi.range(0, result.length).parallel().forEach(lambda);
-This way users can more readily swap between the two.
-
-For collections/arrays in Aparapi we can also offer
-
-    T[] arr = // get an array of T from somewhere
-    ArrayList<T> list = // get an array backed list of T from somewhere
-
-    Aparapi.range(arr).forEach(t -> /* do something with each T */);
-We can create special cases, say for mutating images
-
-    BufferedImage in, out;
-    Aparapi.forEachPixel(in, out, rgb -> rgb[0] = 0 );
-We may also need select operations for associative operations
-
-    class Person{
-       int age;
-       String first;
-       String last;
-    };
-
-    Aparapi.selectOne(people, (p1,p2)-> p1.age>p2.age?p1:p2 );
-##A case for map reduce
-A mapper maps from one type to another, possibly by extracting state. Here is a mapper which maps each String in an array of Strings to its length.
-
-As if the mapper were
-
-    interface MapToInt<T>{ int map(T v); }
-Here it is in action.
-
-    Aparapi.range(strings).map(s->s.length())...
-Now the result is a stream of ints which can be 'reduced' by a reduction lambda.
-
-In this case the reduction reduces two ints to one by choosing the max of k and v. All reductions must be commutative-style operations (max, min, add) where the order of execution is not important.
-
-    int lengthOfLongestString = Aparapi.range(strings).map(s->s.length()).reduce((k,v)-> k>v?k:v);
-Here is a sum reduction.
-
-    int sumOfLengths = Aparapi.range(strings).map(s -> s.length()).reduce((k,v)-> k+v);
-Some of these may be common enough that we offer direct functionality.
-
-    int sumOfLengths = Aparapi.range(strings).map(s -> s.length()).sum();
-    int maxOfLengths = Aparapi.range(strings).map(s -> s.length()).max();
-    int minOfLengths = Aparapi.range(strings).map(s -> s.length()).min();
-    String string = Aparapi.range(strings).map(s->s.length()).select((k,v)-> k>v);
-This last one needs some explaining. We map String to int, then select the String whose length is the greatest.
\ No newline at end of file
diff --git a/source/documentation/ProfilingKernelExecution.html.md b/source/documentation/ProfilingKernelExecution.html.md
deleted file mode 100644
index 4b630552c5db76a82c52a1d57b3f4cf7f46ffadb..0000000000000000000000000000000000000000
--- a/source/documentation/ProfilingKernelExecution.html.md
+++ /dev/null
@@ -1,56 +0,0 @@
----
- title: ProfilingKernelExecution
----
-
-*Using Aparapi's built in profiling APIs Updated May 7, 2013 by frost.g...@gmail.com*
-
-If you want to extract OpenCL performance info from a kernel at runtime you need to set the property:
-
-    -Dcom.aparapi.enableProfiling=true
-
-Your application can then call kernel.getProfileInfo() after a successful call to kernel.execute(range) to extract a List<ProfileInfo>.
-
-Each ProfileInfo holds timing information for buffer writes, kernel executions and buffer reads.
-
-The following code will print a simple table of profile information
-
-    List<ProfileInfo> profileInfo = k.getProfileInfo();
-    for (final ProfileInfo p : profileInfo) {
-       System.out.print(" " + p.getType() + " " + p.getLabel() + " " + (p.getStart() / 1000) + " .. "
-          + (p.getEnd() / 1000) + " " + ((p.getEnd() - p.getStart()) / 1000) + "us");
-       System.out.println();
-    }
-
-Here is an example implementation
-
-    final float result[] = new float[2048*2048];
-    Kernel k = new Kernel(){
-       public void run(){
-          final int gid=getGlobalId();
-          result[gid] =0f;
-       }
-    };
-    k.execute(result.length);
-    List<ProfileInfo> profileInfo = k.getProfileInfo();
-
-    for (final ProfileInfo p : profileInfo) {
-       System.out.print(" " + p.getType() + " " + p.getLabel() + " " + (p.getStart() / 1000) + " .. "
-          + (p.getEnd() / 1000) + " " + ((p.getEnd() - p.getStart()) / 1000) + "us");
-       System.out.println();
-    }
-    k.dispose();
-
-And here is the tabular output from running
-
-    java
-       -Djava.library.path=${APARAPI_HOME}
-       -Dcom.aparapi.enableProfiling=true
-       -cp ${APARAPI_HOME}:.
-       MyClass
-
-    W val$result 69500 .. 72694 3194us
-    X exec()     72694 .. 72835  141us
-    R val$result 75327 .. 78225 2898us
-
-The table shows that the transfer of the 'result' buffer to the device ('W') took 3194 us (microseconds), the execution ('X') of the kernel 141 us, and the read ('R') of the resulting buffer 2898 us.
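-
-Building on the ProfileInfo accessors shown above (getType(), getStart(), getEnd()), a small helper can aggregate total time per operation type, which makes it easy to compare transfer cost ('W' plus 'R') against execution cost ('X'). This is a minimal sketch, assuming the same ProfileInfo type used in the examples above; it is not part of the Aparapi API itself.
-
-    import java.util.HashMap;
-    import java.util.List;
-    import java.util.Map;
-
-    public class ProfileSummary {
-       // Sum the duration of each ProfileInfo entry, keyed by its type (W, X or R)
-       public static void print(List<ProfileInfo> profileInfo) {
-          final Map<String, Long> totals = new HashMap<String, Long>();
-          for (final ProfileInfo p : profileInfo) {
-             final String type = String.valueOf(p.getType());
-             final long us = (p.getEnd() - p.getStart()) / 1000; // nanoseconds to microseconds
-             final Long previous = totals.get(type);
-             totals.put(type, (previous == null ? 0L : previous) + us);
-          }
-          for (final Map.Entry<String, Long> e : totals.entrySet()) {
-             System.out.println(e.getKey() + " total " + e.getValue() + "us");
-          }
-       }
-    }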
\ No newline at end of file diff --git a/source/documentation/ProfilingKernelsFromEclipse.html.md b/source/documentation/ProfilingKernelsFromEclipse.html.md deleted file mode 100644 index 181cd244774c7bb3f72c818f5c7f5133a05b28c1..0000000000000000000000000000000000000000 --- a/source/documentation/ProfilingKernelsFromEclipse.html.md +++ /dev/null @@ -1,100 +0,0 @@ ---- - title: ProfilingKernelsFromEclipse ---- - -*Profiling Kernels with AMD profiler in Eclipse (Indigo) Updated May 14, 2012 by frost.g...@gmail.com* - -##Profiling Kernels with AMD profiler in Eclipse (Indigo) - -Wayne Johnson - -12 May 2012 -Disclaimer: This has been tested with Eclipse (Indigo SR1) only on W7SR1. - -Assume your Eclipse project follows a typical Maven layout: - - Project - src/main/java/... - AlgorithmImplementation.java - src/test/java/... - BenchmarkRunner.java - BenchmarkTest.java - lib/aparapi-2012-02-15/ - aparapi jar file - native libraries for W7, Linux, and OSX - … - profiles/ - [this is where the profiles and logs will be generated] - -1. Download and install the current AMD APP SDK -2. Download and install Aparapi (see Wiki), making sure that the native libraries are on your build path. -3. Create your algorithm implementation(s). - - example: AlgorithmImplementations.java - -4. Create your performance benchmark test as a JUnit test case to exercise your implementations. - - example: BenchmarkTest.java - -5. Test your JUnit test case inside Eclipse using BenchmarkRunner to make sure it works. The runner will be the main application for the runnable jar file you create in the next step. - - This step will also automatically create the launch configuration that the export command will ask you for. Select BenchmarkRunner.java - - Right-click > Run as > Java application - -6. Export your project as a runnable jar file. - - Right-click > Export... - [wizard] Java > Runnable Jar File. Next. - Launch configuration: BenchmarkRunner [1] - Project - Export destination: Project\runner.jar - Library handling: [use default] Finish. - Ok on “...repacks referenced libraries†- Yes on “Confirm replace†[You won’t see this dialog on the first export but will on subsequent exports] - Ok [ignore warning dialog] - - After refreshing Project, you should see a runner.jar file at the top level. - -7. Create an external tool configuration to generate the performance counter profile - - Run > External Tools > External Tool Configurations... - Name: AMD counters - Project - Location: C:\Program Files (x86)\AMD APP\tools\AMD APP Profiler 2.4\x64\sprofile.exe - Arguments: - -o "${project_loc}\profiles\counters.csv" - -w "${project_loc}" - "C:\Program Files\Java\jdk1.6.0_30\bin\java.exe" - -Djava.library.path="lib\aparapi-2012-02-15" - -jar "${project_loc}\runner.jar" - - - Note: The ''java.library.path'' indicates the relative location of the folder containing the native libraries used by Aparapi. If this is not set correctly, steps 9 and 10 below will run in JTP execution mode and the only error message you will see on the Eclipse console is that the profile was not generated. This is because nothing executed on the GPU. - -8. Create an external tool configuration to generate the cltrace and summary profiles. - - 1. Run > External Tools > External Tool Configurations... - 2. Name: AMD cltrace - Project - 3. Location: C:\Program Files (x86)\AMD APP\tools\AMD APP Profiler 2.4\x64\sprofile.exe - 4. 
Arguments: - - `-o "${project_loc}\profiles\cltrace.txt" -k all -r -O -t -T` - - `-w "${project_loc}"` - - `"C:\Program Files\Java\jdk1.6.0_30\bin\java.exe"` - - `-Djava.library.path="lib\aparapi-2012-02-15"` - - `-jar "${project_loc}\runner.jar"` - - -9. Run the AMD profiler counter configuration to generate the counter profile. - - Run > External Tools > AMD counters - Project - - -10. Run the AMD profiler cltrace configuration to generate the cltrace and summary profiles. - - Run > External Tools > AMD cltrace - Project - A project file for testing the above instructions can be found http://code.google.com/p/aparapi/source/browse/trunk/wiki-collateral/ProfilingKernelsFormEclipseProject.zip - diff --git a/source/documentation/SettingUpLinuxHSAMachineForAparapiSidebar.html.md b/source/documentation/SettingUpLinuxHSAMachineForAparapiSidebar.html.md deleted file mode 100644 index 656ad72e0719923f02d704389caac1dbe66c9b63..0000000000000000000000000000000000000000 --- a/source/documentation/SettingUpLinuxHSAMachineForAparapiSidebar.html.md +++ /dev/null @@ -1,13 +0,0 @@ ---- - title: SettingUpLinuxHSAMachineForAparapiSidebar ---- - -*Sidebar for SettingUpLinuxHSAMachineForAparapi* - -* HSA Videos - * [http://www.youtube.com/watch?v=5ntILiXTuhE](http://www.youtube.com/watch?v=5ntILiXTuhE) - * [http://www.youtube.com/watch?v=caEPq4KvTTA](http://www.youtube.com/watch?v=caEPq4KvTTA) -* HSA Articles - * [http://developer.amd.com/resources/heterogeneous-computing/what-is-heterogeneous-computing/](http://developer.amd.com/resources/heterogeneous-computing/what-is-heterogeneous-computing/) -* HSA Foundation - * [https://github.com/HSAFoundation](https://github.com/HSAFoundation) \ No newline at end of file diff --git a/source/documentation/aparapi-patterns.html.md b/source/documentation/aparapi-patterns.html.md new file mode 100644 index 0000000000000000000000000000000000000000..3325b842f72d5e67e6d41d8c8960bd6b29e31162 --- /dev/null +++ b/source/documentation/aparapi-patterns.html.md @@ -0,0 +1,141 @@ +--- +title: Aparapi Patterns +description: Examples and code fragments to demonstrate Aparapi features. +--- + +##Aparapi Patterns + +The following suggestions help solve some common problems found in using Aparapi. + +Additional suggestions and solutions to extend this list would be welcome. + +##How do I return data from a kernel if I can’t write to kernel fields? + +Use a small array buffer (possibly containing a single element) and assign it from the kernel. + +For example, the following kernel code detects whether the `buffer[]` contains the value `1234`. The flag (true or false) is returned in `found[0]`. + +```java + +final int buffer[] = new int[HUGE]; +final boolean found[] = new boolean[]{false}; +// fill buffer somehow + kernel kernel = new kernel(){ + @Override public void run(){ + if (buffer[getGlobald()]==1234){ + found[0]=true; + } + } +}; +kernel.execute(buffer.length); +``` + +This code does include a race condition, whereby more than one value of `Kernel.getGlobalId()` might contain 1234 and try to set `found[0]`. This is not a problem here, because we don't care if multiple kernel executions match, provided one flips the value of `found[0]`. + +##How can I use Aparapi and still maintain an object-oriented view of my data? + +See the NewFeatures page. Aparapi can now handle simple arrays of objects, which minimizes the amount of refactoring required to experiment with Aparapi. 
However, performance is still likely to be better if your algorithm operates on data held in parallel primitive arrays. To get higher performance from Aparapi with minimal exposure to data in this parallel primitive array form, we can (with a little work) allow both forms of data to co-exist. Let's reconsider [the NBody problem](http://en.wikipedia.org/wiki/N-body_problem).
+
+A Java developer writing an NBody solution would most likely create a Body class:
+
+```java
+
+class Body{
+   float x,y,z;
+   float getX(){return x;}
+   void setX(float _x){ x = _x;}
+   float getY(){return y;}
+   void setY(float _y){ y = _y;}
+   float getZ(){return z;}
+   void setZ(float _z){ z = _z;}
+
+   // other data related to Body unused by positioning calculations
+}
+```
+
+The developer would also likely create a container class (such as NBodyUniverse) that manages the positions of multiple Body instances.
+
+```java
+
+class NBodyUniverse{
+   final Body[] bodies;
+   NBodyUniverse(final Body[] _bodies){
+      bodies = _bodies;
+      for (int i=0; i<bodies.length; i++){
+         bodies[i].setX((float)(Math.random()*100));
+         bodies[i].setY((float)(Math.random()*100));
+         bodies[i].setZ((float)(Math.random()*100));
+      }
+   }
+   void adjustPositions(){
+      // can use new array of object Aparapi features, but is not performant
+   }
+}
+Body[] bodies = new Body[BODIES];
+for (int i=0; i<bodies.length; i++){
+   bodies[i] = new Body();
+}
+NBodyUniverse universe = new NBodyUniverse(bodies);
+while (true){
+   universe.adjustPositions();
+   // display NBodyUniverse
+}
+```
+
+The `NBodyUniverse.adjustPositions()` method contains the nested loops (adjusting each body position based on forces impinging on it from all of the other bodies), making it an ideal Aparapi candidate.
+
+Even though this code can now be written by accessing the x, y and z ordinates of `Body[]` via getters/setters, the most performant Aparapi implementation is the one that operates on parallel arrays of floats containing x, y and z ordinates, with the state of `bodies[10]` conceptually stored across `x[10]`, `y[10]` and `z[10]`.
+
+So for performance reasons, you can do something like this:
+
+```java
+
+class Body{
+   int idx;
+   NBodyUniverse universe;
+   void setUniverseAndIndex(NBodyUniverse _universe, int _idx){
+      universe = _universe;
+      idx = _idx;
+   }
+
+   // other fields not used by layout
+
+   void setX(float _x){ universe.x[idx]=_x;}
+   void setY(float _y){ universe.y[idx]=_y;}
+   void setZ(float _z){ universe.z[idx]=_z;}
+   float getX(){ return universe.x[idx];}
+   float getY(){ return universe.y[idx];}
+   float getZ(){ return universe.z[idx];}
+}
+class NBodyUniverse {
+   final Body[] bodies;
+   final float[] x, y, z;
+   NBodyUniverse(Body[] _bodies){
+      bodies = _bodies;
+      x = new float[bodies.length];
+      y = new float[bodies.length];
+      z = new float[bodies.length];
+      for (int i=0; i<bodies.length; i++){
+         bodies[i].setUniverseAndIndex(this, i);
+         bodies[i].setX((float)(Math.random()*100));
+         bodies[i].setY((float)(Math.random()*100));
+         bodies[i].setZ((float)(Math.random()*100));
+      }
+   }
+   void adjustPositions(){
+      // can now more efficiently use Aparapi
+   }
+}
+
+
+
+Body[] bodies = new Body[BODIES];
+for (int i=0; i<bodies.length; i++){
+   bodies[i] = new Body();
+}
+NBodyUniverse universe = new NBodyUniverse(bodies);
+while (true){
+   universe.adjustPositions();
+   // display NBodyUniverse
+}
+```
+
+This example allows Java™ code to treat each Body in a traditional object-oriented fashion and also allows Aparapi kernels to act on the parallel primitive array form, in order to access/mutate the position of the bodies.
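+
+For illustration, here is a minimal, hypothetical sketch of how `adjustPositions()` might drive an Aparapi kernel over the parallel `x[]`, `y[]` and `z[]` arrays declared above. The physics is deliberately simplified (unit masses, a softened distance, naive in-place updates); a real NBody implementation would double-buffer positions rather than mutate them while other work items read them.
+
+```java
+
+void adjustPositions(){
+   final float[] xs = x, ys = y, zs = z; // capture the parallel arrays for the kernel
+   final int count = bodies.length;
+   Kernel kernel = new Kernel(){
+      @Override public void run(){
+         int i = getGlobalId();
+         float ax = 0f, ay = 0f, az = 0f;
+         for (int j = 0; j < count; j++){
+            if (j != i){
+               float dx = xs[j] - xs[i], dy = ys[j] - ys[i], dz = zs[j] - zs[i];
+               float invDist = rsqrt(dx*dx + dy*dy + dz*dz + 0.1f); // softened to avoid divide by zero
+               float s = invDist * invDist * invDist;
+               ax += dx * s; ay += dy * s; az += dz * s;
+            }
+         }
+         // in-place update: fine as a sketch, racy for real NBody physics
+         xs[i] += ax; ys[i] += ay; zs[i] += az;
+      }
+   };
+   kernel.execute(count);
+}
+```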
\ No newline at end of file
diff --git a/source/documentation/choosing-specific-devices.html.md b/source/documentation/choosing-specific-devices.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..28a48bd73488bffd5b80faa5c44fc8b1e8143b52
--- /dev/null
+++ b/source/documentation/choosing-specific-devices.html.md
@@ -0,0 +1,78 @@
+---
+title: Choosing Specific Devices
+description: Using the new Device APIs to choose Kernel execution on a specific device.
+---
+
+Previously Aparapi chose the first GPU device when `Kernel.execute()` was called. This made it easy to execute simple Kernels, but was problematic when users wished for finer control over which device should be chosen, especially when the first device was unsuitable. We recently added new classes and APIs to allow the developer to specify exactly which device to target.
+
+A new Device class has been added. This allows the user to select a specific device, either by calling a helper method such as `Device.firstGPU()` or `Device.best()`, or by iterating through all devices and choosing one based on some other criteria (capabilities? vendor name?).
+
+So selecting the 'best' (most performant) device could be achieved using:
+
+```java
+
+Device device = Device.best();
+```
+
+Alternatively, if I wanted the first AMD GPU device I might use:
+
+```java
+
+Device chosen=null;
+for (Device device: devices.getAll()){
+   if (device.getVendor().contains("AMD") && device.isGPU()){
+      chosen = device;
+      break;
+   }
+}
+```
+
+A Device can be queried (`isGPU()`, `isOpenCL()`, `isGroup()`, `isJava()`, `getOpenCLPlatform()`, `getMaxMemory()`, `getLocalSizes()`) to yield its characteristics.
+
+To execute on a specific device we must use the device to create our range.
+
+```java
+
+Range range = device.createRange2D(width, height);
+```
+
+This allows the Range to be created with knowledge of the underlying device. So for example `device.createRange3D(1024, 1024, 1024, 16, 16, 16)` will fail if the device does not allow a local size of (16x16x16).
+
+A range created using a device method captures the device which created it; the range instance has a device field which is set by the device that creates it.
+
+It's as if we had this code:
+
+```java
+
+Range range = Range.create(width, height);
+range.setDevice(device);
+```
+
+So the Range locks the device that it can be used with.
+
+Now suppose we have a Kernel
+
+```java
+
+Kernel kernel = new Kernel(){
+   @Override public void run(){
+      ...
+   }
+};
+```
+
+and we then use a device-created range.
+
+```java
+
+Device device = Device.firstGPU();
+Kernel kernel = new Kernel(){
+   @Override public void run(){
+      // uses input[];
+   }
+};
+Range range = device.createRange2D(1024, 1024);
+kernel.execute(range);
+```
+
+We have forced execution on the first GPU.
diff --git a/source/documentation/UsingConstantMemory.html.md b/source/documentation/constant-memory.html.md
similarity index 75%
rename from source/documentation/UsingConstantMemory.html.md
rename to source/documentation/constant-memory.html.md
index 490a294795a3e85194122f8d64f369e444ab6441..c587e022892fdc807cfb4e3f2f678612afc22f2b 100644
--- a/source/documentation/UsingConstantMemory.html.md
+++ b/source/documentation/constant-memory.html.md
@@ -1,9 +1,10 @@
 ---
- title: UsingConstantMemory
+title: Constant Memory
+description: How to make use of constant memory in a Kernel.
---
-*How to make use of constant memory in a Kernel Updated Feb 28, 2012 by frost.g...@gmail.com*
 ##How to make use of new constant memory feature
+
 By default all primitive arrays accessed by an Aparapi Kernel are considered global. If we look at the generated code using `-Dcom.aparapi.enableShowGeneratedOpenCL=true` we will see that primitive arrays (such as `int buf[]`) are mapped to `__global` pointers (such as `__global int *buf`) in OpenCL.
 
 Although this makes Aparapi easy to use (especially for Java developers who are unfamiliar with tiered memory hierarchies), it does limit the ability of the 'power developer' wanting to extract more performance from Aparapi on the GPU.
@@ -23,29 +24,35 @@ Aparapi only supports constant arrays, not scalers.
 ##How to define a primitive array as "constant"
 
 We have two ways to define a constant buffer. Either we can decorate the variable name with a _$constant$ suffix (yes, it is a valid identifier in Java).
 
-    final int[] buffer = new int[1024]; // this is global accessable to all work items.
-    final int[] buffer_$constant$ = new int[]{1,2,3,4,5,6,7,8,9} // this is a constant buffer
-
-    Kernel k = new Kernel(){
-       public void run(){
-          // access buffer
-          // access buffer_$constant$
-          // ....
-       }
-    }
+```java
+
+final int[] buffer = new int[1024]; // this is global, accessible to all work items.
+final int[] buffer_$constant$ = new int[]{1,2,3,4,5,6,7,8,9}; // this is a constant buffer
+
+Kernel k = new Kernel(){
+   public void run(){
+      // access buffer
+      // access buffer_$constant$
+      // ....
+   }
+};
+```
 
 Alternatively (if defining inside the derived Kernel class; this cannot be used via the anonymous inner class pattern above!) we can use the @Constant annotation.
 
-    final int[] buffer = new int[1024]; // this is global accessable to all work items.
-
-    Kernel k = new Kernel(){
-       @Constant int[] constantBuffer = new int[]{1,2,3,4,5,6,7,8,9} // this is a constant buffer
-       public void run(){
-          // access buffer
-          // access constantBuffers
-          // ....
-       }
-    }
+```java
+
+final int[] buffer = new int[1024]; // this is global, accessible to all work items.
+
+Kernel k = new Kernel(){
+   @Constant int[] constantBuffer = new int[]{1,2,3,4,5,6,7,8,9}; // this is a constant buffer
+   public void run(){
+      // access buffer
+      // access constantBuffer
+      // ....
+   }
+};
+```
 
 ##Can I see some code?
 I updated the Mandelbrot example so that the palette of RGB values is represented using constant memory; the source can be found here. Look at line #95. BTW for me this resulted in a 5-7% performance improvement.
diff --git a/source/documentation/converting-java-to-opencl.html.md b/source/documentation/converting-java-to-opencl.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..fe477f655e5f7fdb6aee44686eab51037e779a30
--- /dev/null
+++ b/source/documentation/converting-java-to-opencl.html.md
@@ -0,0 +1,346 @@
+---
+title: Converting Java to OpenCL
+description: How Aparapi converts bytecode to OpenCL
+---
+
+##Introduction
+
+This page acts as a quick summary of [the more detailed PDF](/documentation/ByteCode2OpenCL.pdf) that was originally written by AMD.
+
+One of the unique Aparapi features is its ability to convert Java bytecode to OpenCL automatically.
+
+In this page we will try to describe the process used to perform this conversion. If you are unfamiliar with bytecode, consider visiting the page WhatIsBytecode.
+ +The command + +```java + +javac Source.java +``` + +Will compile the java source file Source.java to Source.class + +The classfile format is well documented here and we will not go into too much detail here, however it should be known that Aparapi must parse the classfile of each Kernel to extract the bytecode for the `Kernel.run()` and any method reachable from `Kernel.run()`. + +Lets start with a simple Kernel. + +```java + +import com.aparapi.Kernel; + +public class Squarer extends Kernel{ + int[] in; + int[] out; + @Override public void run(){ + int gid = getGlobalId(0); + out[gid] = in[gid] * in[gid]; + } +} +``` + +We will compile this + +```java + +javac -g -cp path/to/aparapi/aparapi.jar Squarer.java +``` + +and then we can look at the bytecode using javap + +```java + +javap -c -classpath path/to/aparapi/aparapi.jar;. Squarer +``` + +Compiled from "Squarer.java" + +```java + +public class Squarer extends com.aparapi.Kernel + SourceFile: "Squarer.java" + minor version: 0 + major version: 50 + Constant pool: +const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V +const #2 = Method #5.#18; // Squarer.getGlobalId:(I)I +const #3 = Field #5.#19; // Squarer.out:[I +const #4 = Field #5.#20; // Squarer.in:[I +const #5 = class #21; // Squarer +const #6 = class #22; // com/amd/aparapi/Kernel +const #7 = Asciz in; +const #8 = Asciz [I; +const #9 = Asciz out; +const #10 = Asciz <init>; +const #11 = Asciz ()V; +const #12 = Asciz Code; +const #13 = Asciz LineNumberTable; +const #14 = Asciz run; +const #15 = Asciz SourceFile; +const #16 = Asciz Squarer.java; +const #17 = NameAndType #10:#11;// "<init>":()V +const #18 = NameAndType #23:#24;// getGlobalId:(I)I +const #19 = NameAndType #9:#8;// out:[I +const #20 = NameAndType #7:#8;// in:[I +const #21 = Asciz Squarer; +const #22 = Asciz com/amd/aparapi/Kernel; +const #23 = Asciz getGlobalId; +const #24 = Asciz (I)I; + +{ +int[] in; + +int[] out; + +public Squarer(); + Code: + Stack=1, Locals=1, Args_size=1 + 0: aload_0 + 1: invokespecial #1; //Method com/amd/aparapi/Kernel."<init>":()V + 4: return + + +public void run(); + Code: + Stack=5, Locals=2, Args_size=1 + 0: aload_0 + 1: iconst_0 + 2: invokevirtual #2; //Method getGlobalId:(I)I + 5: istore_1 + 6: aload_0 + 7: getfield #3; //Field out:[I + 10: iload_1 + 11: aload_0 + 12: getfield #4; //Field in:[I + 15: iload_1 + 16: iaload + 17: aload_0 + 18: getfield #4; //Field in:[I + 21: iload_1 + 22: iaload + 23: imul + 24: iastore + 25: return +} +``` + +Here we see constant pool of the class and the disassembled bytecode of the default constructor `Squarer()` and the `Squarer.run()` method. + +The constant pool is a table of constant values that can be accessed from the bytecode of any methods from within this class. Some of the constants are String literals defined within the source (or literals used to name classes, fields, methods, variables or signatures), other slots represent Classes, Methods, Fields or Type signatures. These later constant pool entries cross-reference other constant pool entries to describe higher level artifact. + +For example constant pool entry #1 is + +```java + +const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V +``` + +So entry #1 defines a method. The class containing the method is defined in constant pool entry #6. So lets look at constant pool entry #6. 
+ +```java + +const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + +const #6 = class #22; // com/amd/aparapi/Kernel +``` + +At constant pool entry #6 we find a class definition which refers to entry #22 + +```java + +const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + +const #6 = class #22; // com/amd/aparapi/Kernel + +const #22 = Asciz com/amd/aparapi/Kernel; +``` + +Which just contains the String (Ascii) name of the class. + +Looking back at entry #1 again, we note that the Method also references entry #17 which contains a NameAndType entry for determining the method name and the signature. + +```java + +const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + +const #6 = class #22; // com/amd/aparapi/Kernel + + +const #17 = NameAndType #10:#11;// "<init>":()V + +const #22 = Asciz com/amd/aparapi/Kernel; +``` + +Entry #17's "NameAndType" references #10 for the method name. + +```java + +const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + +const #6 = class #22; // com/amd/aparapi/Kernel + +const #10 = Asciz <init>; + +const #17 = NameAndType #10:#11;// "<init>":()V + +const #22 = Asciz com/amd/aparapi/Kernel; +``` + +And then references #11 to get the signature. + +```java + +const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + +const #6 = class #22; // com/amd/aparapi/Kernel + +const #10 = Asciz <init>; + +const #11 = Asciz ()V; + +const #17 = NameAndType #10:#11;// "<init>":()V + +const #22 = Asciz com/amd/aparapi/Kernel; +``` + +So from constant pool #1 we ended up using slots 1,6,10,11,17 and 22 to fully resolve the method. + +This looks like a lot of work, however by breaking method and field references up like this, allows the various slots to be reused by other field/method descriptions. + +So when we see disassembled bytecode which references a constantpool slot the actual slot # (2 in the example below) will appear after the bytecode for invokevirtual. + +```java + +2: invokevirtual #2; Method getGlobalId:(I)I +``` + +Bytecode is basically able to access three things + +1. Constant pool entries +2. Variable slots +3. Stack operands + +Instructions are able to pop operands from the stack, push operands to the stack, load values from variable slots (to the stack), store values (from the stack) to variable slots, store values from accessed fields (to the stack) and call methods (popping args from the stack). + +Some instructions can only handle specific types (int, float, double, and object instances - arrays are special forms of objects) and usually the first character of the instruction helps determine which type the instruction acts upon. So imul would be a multiply instruction that operates on integers, fmul would multiply two floats, dmul for doubles. Instructions that begin with 'a' operate on object instances. + +So lets look at the first instruction. + +```java + +0: aload_0 +``` + +This instruction loads an object (a is the first character) from variable slot 0 (we'll come back to the variable slots in a moment) and pushes it on the stack. + +Variables are held in 'slots' that are reserved at compiled time. + +Consider this static method. + +```java + +static int squareMe(int value){ + value += value; + return(value); +} +``` + +This method requires one variable slot. At any one time there is only one variable that is live, it just happens to be an argument to the method. + +The following method also contains one slot. 
+
+```java
+
+static int squareMe(){
+   int value=4;
+   value += value;
+   return(value);
+}
+```
+
+Here we need two slots:
+
+```java
+
+static int squareMe(int arg){
+   int value=arg*arg;
+   return(value);
+}
+```
+
+Surprisingly, the following also only requires two slots.
+
+    static int squareMe(int arg){
+       {
+          int temp = arg*arg;
+       }
+       int value=arg*arg;
+       return(value);
+    }
+
+Note that in the above example the temp variable goes out of scope before the local variable value is used, so only two slots are required; temp and value can share a slot.
+
+If we have an instance method we always require one extra slot (always slot 0) for the this reference.
+
+So
+
+```java
+
+int squareMe(int arg){
+   int value=arg*arg;
+   return(value);
+}
+```
+
+requires three slots.
+
+Anyway, back to our bytecode.
+
+```java
+
+0: aload_0
+```
+
+This instruction loads the object instance in slot 0 (this) and pushes it on the stack.
+
+Next we have
+
+```java
+
+1: iconst_0
+```
+
+which pushes the int constant 0 on the stack. So the stack contains {this, 0}.
+
+Next we have
+
+```java
+
+2: invokevirtual #2; //Method getGlobalId:(I)I
+```
+
+This is the bytecode for calling a method. The instruction references the constant pool (we'll come back to this ;) ) and pulls the method description from constant pool entry #2, which happens to be the description for a method called `getGlobalId()` which takes an int and returns an `int`.
+
+So the VM will pop the top value (the int constant 0) as the method arg, then pop an object reference (this!), call the method `this.getGlobalId(0)` and push the result (an int) back on the stack.
+
+So our stack which contained `{this, 0}` now contains the result of this.getGlobalId(0); let's assume it is `{0}`. We describe this invoke instruction as consuming two operands from the stack and producing one.
+
+Before we start executing, the stack is empty {} and the slots are initialized with 'this' (if an instance method) and any arguments passed to the method.
+
+```java
+
+                                          slots=[this, ?]   stack={}
+0: aload_0                                slots=[this, ?]   stack={this}
+1: iconst_0                               slots=[this, ?]   stack={this, 0}
+2: invokevirtual #2; getGlobalId:(I)I     slots=[this, ?]   stack={result of this.getGlobalId(0), say 0}
+5: istore_1                               slots=[this, 0]   stack={}
+6: aload_0                                slots=[this, 0]   stack={this}
+7: getfield #3; //Field out:[I
+```
diff --git a/source/documentation/emulating-multiple-entrypoints.html.md b/source/documentation/emulating-multiple-entrypoints.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..567b5f72f69b0023dfbd4a0a1188e6b56ee0ff10
--- /dev/null
+++ b/source/documentation/emulating-multiple-entrypoints.html.md
@@ -0,0 +1,238 @@
+---
+title: Emulating Multiple Entrypoints
+description: How to emulate multiple entrypoints using existing Aparapi APIs
+---
+
+##Emulating Multiple Entrypoints Using Existing Aparapi APIs
+
+Until we have support for multiple entrypoints in Aparapi, there are some tricks for emulating this feature.
+
+Suppose we wanted to create a general VectorMath kernel which might expose unary square and square-root methods alongside binary addition and subtraction functionality. With our current API limitations we can't easily do this, but we can approximate having separate methods by passing an extra arg to dictate the 'function' that we wish to perform.
+
+```java
+
+class VectorKernel extends Kernel{
+   float[] lhsOperand;
+   float[] rhsOperand;
+   float[] unaryOperand;
+   float[] result;
+   final static int FUNC_ADD =0;
+   final static int FUNC_SUB =1;
+   final static int FUNC_SQR =2;
+   final static int FUNC_SQRT =3;
+   // other functions
+   int function;
+   @Override public void run(){
+      int gid = getGlobalId(0);
+      if (function==FUNC_ADD){
+         result[gid]=lhsOperand[gid]+rhsOperand[gid];
+      }else if (function==FUNC_SUB){
+         result[gid]=lhsOperand[gid]-rhsOperand[gid];
+      }else if (function==FUNC_SQR){
+         result[gid]=unaryOperand[gid]*unaryOperand[gid];
+      }else if (function==FUNC_SQRT){
+         result[gid]=sqrt(unaryOperand[gid]);
+      }else if ....
+   }
+}
+```
+
+To use this to add two vectors and then take the sqrt of the result, we would use something like:
+
+```java
+
+int SIZE=1024;
+Range range = Range.create(SIZE);
+VectorKernel vk = new VectorKernel();
+vk.lhsOperand = new float[SIZE];
+vk.rhsOperand = new float[SIZE];
+vk.unaryOperand = new float[SIZE];
+vk.result = new float[SIZE];
+
+// fill lhsOperand omitted
+// fill rhsOperand omitted
+vk.function = VectorKernel.FUNC_ADD;
+vk.execute(range);
+System.arraycopy(vk.result, 0, vk.unaryOperand, 0, SIZE);
+vk.function = VectorKernel.FUNC_SQRT;
+vk.execute(range);
+```
+
+This approach is fairly common and I have used it successfully to perform various pipeline stages for calculating FFTs, for example. Whilst this is functional it is not a great solution. First, the API is clumsy. We have to mutate the state of the kernel instance and then re-arrange the arrays manually to chain math operations. We could of course hide all of this behind helper methods. One could imagine, for example, an implementation which exposes helper add(lhs, rhs) or sqrt() methods which hide all the nasty stuff.
+
+```java
+
+class VectorKernel extends Kernel{
+   float[] lhsOperand;
+   float[] rhsOperand;
+   float[] unaryOperand;
+   float[] result;
+   final static int FUNC_ADD =0;
+   final static int FUNC_SUB =1;
+   final static int FUNC_SQR =2;
+   final static int FUNC_SQRT =3;
+   // other functions
+   int function;
+   @Override public void run(){
+      int gid = getGlobalId(0);
+      if (function==FUNC_ADD){
+         result[gid]=lhsOperand[gid]+rhsOperand[gid];
+      }else if (function==FUNC_SUB){
+         result[gid]=lhsOperand[gid]-rhsOperand[gid];
+      }else if (function==FUNC_SQR){
+         result[gid]=unaryOperand[gid]*unaryOperand[gid];
+      }else if (function==FUNC_SQRT){
+         result[gid]=sqrt(unaryOperand[gid]);
+      }else if ....
+   }
+   private void binary(int operator, float[] lhs, float[] rhs){
+      lhsOperand = lhs;
+      rhsOperand = rhs;
+      function=operator;
+      execute(lhs.length);
+   }
+   public void add(float[] lhs, float[] rhs){
+      binary(FUNC_ADD, lhs, rhs);
+   }
+
+   public void sub(float[] lhs, float[] rhs){
+      binary(FUNC_SUB, lhs, rhs);
+   }
+
+   private void binary(int operator, float[] rhs){
+      System.arraycopy(result, 0, lhsOperand, 0, result.length);
+      rhsOperand = rhs;
+      function=operator;
+      execute(lhsOperand.length);
+   }
+
+   public void add(float[] rhs){
+      binary(FUNC_ADD, rhs);
+   }
+
+   public void sub(float[] rhs){
+      binary(FUNC_SUB, rhs);
+   }
+
+   private void unary(int operator, float[] unary){
+      unaryOperand = unary;
+      function=operator;
+      execute(unaryOperand.length);
+   }
+
+   public void sqrt(float[] unary){
+      unary(FUNC_SQRT, unary);
+   }
+
+   private void unary(int operator){
+      System.arraycopy(result, 0, unaryOperand, 0, result.length);
+      function=operator;
+      execute(unaryOperand.length);
+   }
+
+   public void sqrt(){
+      unary(FUNC_SQRT);
+   }
+
+}
+
+VectorKernel vk = new VectorKernel(SIZE);
+vk.add(copyLhs, copyRhs); // copies args to lhs and rhs operands
+                          // sets function type
+                          // and executes kernel
+vk.sqrt();                // because we have no arg
+                          // copies result to unary operand
+                          // sets function type
+                          // executes kernel
+```
+
+However there is one more objection to this approach, namely that by default it will force unnecessary buffer copies.
+
+When the bytecode for the above `Kernel.run()` method is analyzed, Aparapi finds bytecode reading from the lhsOperand, rhsOperand and unaryOperand arrays/buffers. Obviously at this bytecode analysis stage we can't predict which 'function type' will be used, so on every execution (Kernel.run()) Aparapi must copy all three buffers to the GPU. For binary operations this is one wasted buffer copy (the unaryOperand); for the unary operations we copy two buffers unnecessarily (lhsOperand and rhsOperand). We can of course use explicit buffer management to help us reduce these costs. Ideally we add this to our helper methods.
+
+```java
+
+class VectorKernel extends Kernel{
+   float[] lhsOperand;
+   float[] rhsOperand;
+   float[] unaryOperand;
+   float[] result;
+   final static int FUNC_ADD =0;
+   final static int FUNC_SUB =1;
+   final static int FUNC_SQR =2;
+   final static int FUNC_SQRT =3;
+   // other functions
+   int function;
+   @Override public void run(){
+      int gid = getGlobalId(0);
+      if (function==FUNC_ADD){
+         result[gid]=lhsOperand[gid]+rhsOperand[gid];
+      }else if (function==FUNC_SUB){
+         result[gid]=lhsOperand[gid]-rhsOperand[gid];
+      }else if (function==FUNC_SQR){
+         result[gid]=unaryOperand[gid]*unaryOperand[gid];
+      }else if (function==FUNC_SQRT){
+         result[gid]=sqrt(unaryOperand[gid]);
+      }else if ....
+   }
+   private void binary(int operator, float[] lhs, float[] rhs){
+      lhsOperand = lhs;
+      rhsOperand = rhs;
+      function=operator;
+      put(lhsOperand).put(rhsOperand);
+      execute(lhs.length);
+      get(result);
+   }
+   public void add(float[] lhs, float[] rhs){
+      binary(FUNC_ADD, lhs, rhs);
+   }
+
+   public void sub(float[] lhs, float[] rhs){
+      binary(FUNC_SUB, lhs, rhs);
+   }
+
+   private void binary(int operator, float[] rhs){
+      System.arraycopy(result, 0, lhsOperand, 0, result.length);
+      rhsOperand = rhs;
+      function=operator;
+      put(lhsOperand).put(rhsOperand);
+      execute(lhsOperand.length);
+      get(result);
+   }
+
+   public void add(float[] rhs){
+      binary(FUNC_ADD, rhs);
+   }
+
+   public void sub(float[] rhs){
+      binary(FUNC_SUB, rhs);
+   }
+
+   private void unary(int operator, float[] unary){
+      unaryOperand = unary;
+      function=operator;
+      put(unaryOperand);
+      execute(unaryOperand.length);
+      get(result);
+   }
+
+   public void sqrt(float[] unary){
+      unary(FUNC_SQRT, unary);
+   }
+
+   private void unary(int operator){
+      System.arraycopy(result, 0, unaryOperand, 0, result.length);
+      function=operator;
+      put(unaryOperand);
+      execute(unaryOperand.length);
+      get(result);
+   }
+
+   public void sqrt(){
+      unary(FUNC_SQRT);
+   }
+
+}
+```
+
diff --git a/source/documentation/ExplicitBufferHandling.html.md b/source/documentation/explicit-buffer-handling.html.md
similarity index 57%
rename from source/documentation/ExplicitBufferHandling.html.md
rename to source/documentation/explicit-buffer-handling.html.md
index c901ed91b98f4e4917ad8fe3f84170fa9683179a..e6655352a0d92cd435b85f923c1c851c7389c052 100644
--- a/source/documentation/ExplicitBufferHandling.html.md
+++ b/source/documentation/explicit-buffer-handling.html.md
@@ -1,21 +1,24 @@
 ---
- title: ExplicitBufferHandling
+title: Explicit Buffer Handling
+description: How to minimize buffer transfers.
// reads/writes hugeArray +}; +for (int loop=0; loop <MAXLOOP; loop++){ + // copy hugeArray to GPU + kernel.execute(HUGE); + // copy hugeArray back from the GPU +} +``` In reality hugeArray only needs to be copied to the GPU once (prior to the loop) and then once again when the loop has terminated. Here we use comments to indicated the 'optimal' transfers. - final int[] hugeArray = new int[HUGE]; - Kernel kernel= new Kernel(){ - ... // reads/writes hugeArray - }; - // Ideally transfer hugeArray to GPU here - for (int loop=0; loop <MAXLOOP; loop++){ - kernel.execute(HUGE); - } - // Ideally transfer hugeArray back from GPU here +```java + +final int[] hugeArray = new int[HUGE]; +Kernel kernel= new Kernel(){ + ... // reads/writes hugeArray +}; +// Ideally transfer hugeArray to GPU here +for (int loop=0; loop <MAXLOOP; loop++){ + kernel.execute(HUGE); +} +// Ideally transfer hugeArray back from GPU here +``` Consider another common pattern - final int[] hugeArray = new int[HUGE]; - final int[] done = new int[]{0}; - Kernel kernel= new Kernel(){ - ... // reads/writes hugeArray and writes to done[0] when complete - }; - done[0]=0; - while (done[0] ==0)){ - kernel.execute(HUGE); - } +```java + +final int[] hugeArray = new int[HUGE]; +final int[] done = new int[]{0}; +Kernel kernel= new Kernel(){ + ... // reads/writes hugeArray and writes to done[0] when complete +}; +done[0]=0; +while (done[0] ==0)){ + kernel.execute(HUGE); +} +``` This is a common pattern in reduce stages of map-reduce type problems. Essentially the developer wants to keep executing a kernel until some condition is met. For example, this may be seen in bitonic sort implementations and various financial applications. @@ -67,19 +79,22 @@ As we demonstrated above, by default Aparapi will transfer `done[]` and `hugeArr To demonstrate which buffers are being transfered, these copies are shown as comments in the following version of the code. - final int[] hugeArray = new int[HUGE]; - final int[] done = new int[]{0}; - Kernel kernel= new Kernel(){ - ... // reads/writes hugeArray and writes to done[0] when complete - }; - done[0]=0; - while (done[0] ==0)){ - // Send done[] to GPU - // Send hugeArray[] to GPU - kernel.execute(HUGE); - // Fetch done[] from GPU - // Fetch hugeArray[] from GPU - } +```java + +final int[] hugeArray = new int[HUGE]; +final int[] done = new int[]{0}; +Kernel kernel= new Kernel(){ + ... // reads/writes hugeArray and writes to done[0] when complete +}; +done[0]=0; +while (done[0] ==0)){ + // Send done[] to GPU + // Send hugeArray[] to GPU + kernel.execute(HUGE); + // Fetch done[] from GPU + // Fetch hugeArray[] from GPU +} +``` Further analysis of the code reveals that `hugeArray[]` is not accessed by the loop containing the kernel execution, so Aparapi is performing 999 unnecessary transfers to the device and 999 unnecessary transfers back. Only two transfers of `hugeArray[]` are needed; one to move the initial data to the GPU and one to move it back after the loop terminates. @@ -93,68 +108,83 @@ To use this feature first the developer needs to 'turn on' explicit mode, using The following code illustrates the use of these new explicit buffer management APIs. - final int[] hugeArray = new int[HUGE]; - final int[] done = new int[]{0}; - Kernel kernel= new Kernel(){ - ... 
// reads/writes hugeArray and writes to done[0] when complete - }; - kernel.setExplicit(true); - done[0]=0; - kernel.put(done); - kernel.put(hugeArray); - while (done[0] ==0)){ - kernel.execute(HUGE); - kernel.get(done); - } - kernel.get(hugeArray); +```java + +final int[] hugeArray = new int[HUGE]; +final int[] done = new int[]{0}; +Kernel kernel= new Kernel(){ + ... // reads/writes hugeArray and writes to done[0] when complete +}; +kernel.setExplicit(true); +done[0]=0; +kernel.put(done); +kernel.put(hugeArray); +while (done[0] ==0)){ + kernel.execute(HUGE); + kernel.get(done); +} +kernel.get(hugeArray); +``` Note that marking a kernel as explicit and failing to request the appropriate transfer is a programmer error. We deliberately made `Kernel.put(...)`, `Kernel.get(...)` and `Kernel.execute(range)` return an instance of the executing kernel to allow these calls be chained. Some may find this fluent style API more expressive. - final int[] hugeArray = new int[HUGE]; - final int[] done = new int[]{0}; - Kernel kernel= new Kernel(){ - ... // reads/writes hugeArray and writes to done[0] when complete - }; - kernel.setExplicit(true); - done[0]=0; - kernel.put(done).put(hugeArray); // chained puts - while (done[0] ==0)){ - kernel.execute(HUGE).get(done); // chained execute and put - } - kernel.get(hugeArray); +```java + +final int[] hugeArray = new int[HUGE]; +final int[] done = new int[]{0}; +Kernel kernel= new Kernel(){ + ... // reads/writes hugeArray and writes to done[0] when complete +}; +kernel.setExplicit(true); +done[0]=0; +kernel.put(done).put(hugeArray); // chained puts +while (done[0] ==0)){ + kernel.execute(HUGE).get(done); // chained execute and put +} +kernel.get(hugeArray); +``` An alternate approach for loops containing a single `kernel.execute(range)` call. One variant of code which would normally suggest the use of Explicit Buffer Management can be handled differently. For cases where `Kernel.execute(range)` is the sole statement inside a loop and where the iteration count is known prior to the first iteration we offer an alternate (hopefully more elegant) way of minimizing buffer transfers. So for cases like:- - final int[] hugeArray = new int[HUGE]; - Kernel kernel= new Kernel(){ - ... // reads/writes hugeArray - }; +```java - for (int pass=0; pass<1000; pass++){ - kernel.execute(HUGE); - } +final int[] hugeArray = new int[HUGE]; +Kernel kernel= new Kernel(){ + ... // reads/writes hugeArray +}; + +for (int pass=0; pass<1000; pass++){ + kernel.execute(HUGE); +} +``` The developer can request that Aparapi perform the outer loop rather than coding the loop. This is achieved explicitly by passing the iteration count as the second argument to `Kernel.execute(range, iterations)`. Now any form of code that looks like :- - int range = 1024; - int loopCount = 64; - for (int passId = 0; passId < loopCount; passId++){ - kernel.execute(range); - } +```java + +int range = 1024; +int loopCount = 64; +for (int passId = 0; passId < loopCount; passId++){ + kernel.execute(range); +} +``` Can be replaced with - int range = 1024; - int loopCount = 64; +```java - kernel.execute(range, loopCount); +int range = 1024; +int loopCount = 64; + +kernel.execute(range, loopCount); +``` Not only does this make the code more compact and avoids the use of explicit buffer management APIs, it allows Aparapi visibility to the complete loop so that Aparapi can minimize the number of transfers. Aparapi will only transfer buffers to the GPU once and transfer them back once, resulting in improved performance. 
@@ -162,62 +192,73 @@ Sometimes kernel code using this loop-pattern needs to track the current iterati The code for this would have looked something like - int range = 1024; - int loopCount = 64; - final int[] hugeArray = new int[HUGE]; - final int[] passId = new int[0]; - Kernel kernel = new Kernel(){ - @Override public void run(){ - int id=getGlobalId(); - if (passId[0] == 0){ - // perform some initialization! - } - ... // reads/writes hugeArray - } - }; - Kernel.setExplicit(true); - kernel.put(hugeArray); - for (passId[0]=0; passId[0]<loopCount; passId[0]++){ - - kernel.put(passId).execute(range); - } +```java + +int range = 1024; +int loopCount = 64; +final int[] hugeArray = new int[HUGE]; +final int[] passId = new int[0]; +Kernel kernel = new Kernel(){ + @Override public void run(){ + int id=getGlobalId(); + if (passId[0] == 0){ + // perform some initialization! + } + ... // reads/writes hugeArray + } +}; +Kernel.setExplicit(true); +kernel.put(hugeArray); +for (passId[0]=0; passId[0]<loopCount; passId[0]++){ + + kernel.put(passId).execute(range); +} +``` + In the current version of Aparapi we added `Kernel.getPassId()` to allow a Kernel to determine the current ‘pass’ through the outer loop without having to use explicit buffer management. So the previous code can now be written without any explicit buffer management APIs:- - final int[] hugeArray = new int[HUGE]; - final int[] pass[] = new int[]{0}; - Kernel kernel = new Kernel(){ - @Override public void run(){ - int id = getGlobalId(); - int pass = getPassId(); - if (pass == 0){ - // perform some initialization! - } - ... // reads/writes both hugeArray - } - }; - - kernel.execute(HUGE, 1000); +```java + +final int[] hugeArray = new int[HUGE]; +final int[] pass[] = new int[]{0}; +Kernel kernel = new Kernel(){ + @Override public void run(){ + int id = getGlobalId(); + int pass = getPassId(); + if (pass == 0){ + // perform some initialization! + } + ... // reads/writes both hugeArray + } +}; + +kernel.execute(HUGE, 1000); +``` + One common use for Kernel.getPassId() is to avoid flipping buffers in the outer loop. It is common for kernels to process data from one buffer to another, and in the next invocation process the data back the other way. Now these kernels can use the passId (odd or even) to determine the direction of data transfer. 
-    final int[] arr1 = new int[HUGE];
-    final int[] arr2 = new int[HUGE];
-    Kernel kernel = new Kernel(){
-       int f(int v){ … }
+```java
+
+final int[] arr1 = new int[HUGE];
+final int[] arr2 = new int[HUGE];
+Kernel kernel = new Kernel(){
+    int f(int v){ … }

-       @Override public void run(){
-          int id = getGlobalId();
-          int pass = getPassId();
-          if (pass % 2 == 0){
-             arr1[id] = f(arr2[id]);
-          }else{
-             arr2[id] = f(arr1[id]);
+    @Override public void run(){
+        int id = getGlobalId();
+        int pass = getPassId();
+        if (pass % 2 == 0){
+            arr1[id] = f(arr2[id]);
+        }else{
+            arr2[id] = f(arr1[id]);

-          }
-       }
-    };
+        }
+    }
+};

-    kernel.execute(HUGE, 1000);
\ No newline at end of file
+
+kernel.execute(HUGE, 1000);
+```
\ No newline at end of file
diff --git a/source/documentation/HSAEnablementOfLambdaBranch.html.md b/source/documentation/hsa-enabled-lambda.html.md
similarity index 80%
rename from source/documentation/HSAEnablementOfLambdaBranch.html.md
rename to source/documentation/hsa-enabled-lambda.html.md
index 43d01600deb0afdbedace4433e8e8873367f4800..f07a2de17d5bb120a0ad358c78cd6c749a4cf13c 100644
--- a/source/documentation/HSAEnablementOfLambdaBranch.html.md
+++ b/source/documentation/hsa-enabled-lambda.html.md
@@ -1,11 +1,10 @@
---
- title: HSAEnablementOfLambdaBranch
+title: HSA Enabled Lambda
+description: Adding HSA Support to Aparapi lambda branch.
---

-*Adding HSA Support to Aparapi lambda branch Updated Feb 28, 2014 by frost.g...@gmail.com*
-
-* [How to setup a HSA enabled Linux Platform](SettingUpLinuxHSAMachineForAparapi.md)
-* [How to setup a HSA simulator on a Linux Platform](UsingAparapiLambdaBranchWithHSASimulator.md)
+* [How to setup a HSA enabled Linux Platform](/documentation/setting-up-hsa.html)
+* [How to setup a HSA simulator on a Linux Platform](/documentation/using-hsa-simulator.html)

Recently the HSA Foundation released their ‘Programmers Reference Manual’. This manual is for developers wishing to write code for upcoming HSA compatible devices; it describes the HSA Intermediate Language (HSAIL) along with its binary form (BRIG) and describes how code is expected to execute on HSA enabled devices.

@@ -25,11 +24,15 @@ In the existing code (early prototype) we provide access to HSA as a specific de

So our ubiquitous 'squares' example will initially be written as:

-    int in[] = ..//
-    int out[] = .../
-    Device.hsa().forEach(in.length, (i)->{
-       out[i] = in[i]*in[i];
-    });
+```java
+
+int in[] = ...  // input data
+int out[] = ... // output data
+Device.hsa().forEach(in.length, (i)->{
+    out[i] = in[i]*in[i];
+});
+```
+
You will obviously need a Java 8 compatible JDK ([https://jdk8.java.net/download.html](https://jdk8.java.net/download.html)) in your path. We also recommend using IntelliJ which has preliminary support for Java 8 lambda features. You can download the community edition of IntelliJ from [http://www.jetbrains.com/idea/](http://www.jetbrains.com/idea/)
\ No newline at end of file
diff --git a/source/documentation/JavaKernelGuidelines.html.md b/source/documentation/kernel-guidelines.html.md
similarity index 91%
rename from source/documentation/JavaKernelGuidelines.html.md
rename to source/documentation/kernel-guidelines.html.md
index 5b2978f02c07346f2bc2dece6d32471bd16aebd9..587ced37584d7272b617da73801901c87cc618b1 100644
--- a/source/documentation/JavaKernelGuidelines.html.md
+++ b/source/documentation/kernel-guidelines.html.md
@@ -1,8 +1,8 @@
---
- title: JavaKernelGuidelines
+title: Kernel Guidelines
+description: What code can and can't be converted to OpenCL by Aparapi.
---

-*What code can and can't be converted to OpenCL by Aparapi. Updated Sep 13, 2011 by frost.g...@gmail.com*

##Aparapi Java Kernel Guidelines

Certain practices can improve the chances of your Java kernel being converted to OpenCL and executing on a GPU.

@@ -46,14 +46,16 @@ Some restrictions/guidelines may be removed or augmented in a future Aparapi rel

* A variable cannot have its first assignment be the side effect of an expression evaluation or a method call. For example, the following will not be translated to run on the GPU.

-    int foo(int a) {
-       // . . .
-    }
-    public void run() {
-       int z;
-       foo(z = 3);
-    }
+```java
+int foo(int a) {
+    // . . .
+}
+public void run() {
+    int z;
+    foo(z = 3);
+}
+```

* This should be regarded as an error which needs to be addressed; as a workaround, explicitly initialize variables (even to 0) when declared.

@@ -61,15 +63,22 @@ Some restrictions/guidelines may be removed or augmented in a future Aparapi rel

OpenCL is C99-based and as such the result of expressions depending on side effects of other expressions can differ from what one might expect from Java; please avoid using code that assumes Java's tighter rules. Generally code should be as simple as possible. For example, although Java explicitly defines

-    arra[i++] = arrb[i++];
+```java
+
+arra[i++] = arrb[i++];
+```
+
to be equivalent to

-    arra[i] = arrb[i+1];
-    i += 2;
+```java
+
+arra[i] = arrb[i+1];
+i += 2;
+```

The C99/OpenCL standard does not define this and so the result would be undefined.

##Runtime Exceptions

-* When run on the GPU, array accesses will not generate an ArrayIndexOutOfBoundsException. Instead the behavior will be unspecified. 
+* When run on the GPU, array accesses will not generate an ArrayIndexOutOfBoundsException. Instead the behavior will be unspecified.
* When run on the GPU, ArithmeticExceptions will not be generated, for example with integer division by zero. Instead the behavior will be unspecified.

Attribution
diff --git a/source/documentation/LIbraryAgentDuality.html.md b/source/documentation/library-agent-duality.html.md
similarity index 83%
rename from source/documentation/LIbraryAgentDuality.html.md
rename to source/documentation/library-agent-duality.html.md
index 7fca3ef7160b8442acad8d8cec71bffc811d8c49..66e8a34c116eb67d57abf2991ed301708d04b441 100644
--- a/source/documentation/LIbraryAgentDuality.html.md
+++ b/source/documentation/library-agent-duality.html.md
@@ -1,18 +1,23 @@
---
- title: LIbraryAgentDuality
+title: Library Agent Duality
+description: Aparapi libraries can be loaded as JVMTI agents.
---

-*Aparapi libraries can now be loaded as JVMTI agents. Updated Jan 15, 2013 by frost.g...@gmail.com*
-
##What are all these check-ins referring to JVMTI agents?

If you have been tracking Aparapi SVN checkins you will have noticed a bunch of changes to JNI code. I just finished arranging for aparapi libraries (.dll or .so) to be able to be loaded as a JVMTI agent.

Now (assuming the library is in ${APARAPI_DIR}) we can either launch using the traditional...

-    java –Djava.library.path=${APARAPI_DIR} –classpath ${APARAPI_DIR}/aparapi.jar;my.jar mypackage.MyClass
+```bash
+
+java -Djava.library.path=${APARAPI_DIR} -classpath ${APARAPI_DIR}/aparapi.jar;my.jar mypackage.MyClass
+```

or ...
-    java –agentpath=${APARAPI_DIR}/aparapi_x86_64.dll –classpath ${APARAPI_DIR}/aparapi.jar;my.jar mypackage.MyClass
+```bash
+
+java -agentpath:${APARAPI_DIR}/aparapi_x86_64.dll -classpath ${APARAPI_DIR}/aparapi.jar;my.jar mypackage.MyClass
+```

So the dll/so is now both ‘just a library’ and a JVMTI agent.
diff --git a/source/documentation/UsingLocalMemory.html.md b/source/documentation/local-memory.html.md
similarity index 50%
rename from source/documentation/UsingLocalMemory.html.md
rename to source/documentation/local-memory.html.md
index a751868bc79c984ef8c14bee86630c1f8affc659..59dbe4d5431c335b09202afc7b0c4a8d82c9c3d6 100644
--- a/source/documentation/UsingLocalMemory.html.md
+++ b/source/documentation/local-memory.html.md
@@ -1,9 +1,10 @@
---
- title: UsingLocalMemory
+title: Local Memory
+description: How to make use of local memory in a Kernel.
---

-*How to make use of local memory in a Kernel Updated Feb 28, 2012 by frost.g...@gmail.com*
##How to make use of new local memory feature
+
By default all primitive arrays accessed by an Aparapi Kernel are considered global. If we look at the generated code using -Dcom.aparapi.enableShowGeneratedOpenCL=true we will see that primitive arrays (such as int buf[]) are mapped to __global pointers (such as __global int *buf) in OpenCL.

Although this makes Aparapi easy to use (especially to Java developers who are unfamiliar with tiered memory hierarchies), it does limit the ability of the 'power developer' wanting to extract more performance from Aparapi on the GPU.

@@ -19,84 +20,110 @@ Local memory is shared by all work item's (kernel instances) executing in the sa

Aparapi only supports local arrays, not scalars.

##How to define a primitive array as "local"
+
We have two ways to define a local buffer. Either we can decorate the variable name with a _$local$ suffix (yes it is a valid identifier in Java).

-    final int[] buffer = new int[1024]; // this is global accessable to all work items.
-    final int[] buffer_$local$ = new int[1024]; // this is a local buffer 1024 int's shared across all work item's in a group
+```java
+
+final int[] buffer = new int[1024]; // this is global, accessible to all work items.
+final int[] buffer_$local$ = new int[1024]; // this is a local buffer of 1024 ints shared across all work items in a group

-    Kernel k = new Kernel(){
-       public void run(){
-          // access buffer
-          // access buffer_$local$
-          localBarrier(); // allows all writes to buffer_$local$ to be synchronized across all work items in this group
-          // ....
-       }
-    }
+Kernel k = new Kernel(){
+    public void run(){
+        // access buffer
+        // access buffer_$local$
+        localBarrier(); // allows all writes to buffer_$local$ to be synchronized across all work items in this group
+        // ....
+    }
+};
+```
+
Alternatively (if defining inside the derived Kernel class - cannot be used via anonymous inner class pattern above!) we can use the @Local annotation.

-    final int[] buffer = new int[1024]; // this is global accessable to all work items.
+```java

-    Kernel k = new Kernel(){
-       @Local int[] localBuffer = new int[1024]; // this is a local buffer 1024 int's shared across all work item's in a group
-       public void run(){
-          // access buffer
-          // access localBuffer
-          localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
-          // ....
-       }
-    }
+final int[] buffer = new int[1024]; // this is global, accessible to all work items.
+
+Kernel k = new Kernel(){
+    @Local int[] localBuffer = new int[1024]; // this is a local buffer of 1024 ints shared across all work items in a group
+    public void run(){
+        // access buffer
+        // access localBuffer
+        localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
+        // ....
+    }
+};
+```
+
##How do I know how big to make my local buffer?

This is where the new Range class helps. If we create a Range using:

-    Range rangeWithUndefinedGroupSize = Range.create(1024);
+```java
+
+Range rangeWithUndefinedGroupSize = Range.create(1024);
+```
+
Aparapi will pick a suitable group size. Generally this will be the highest factor of global size <= 256. So for a global size which is a power of two (and greater or equal to 256 ;) ) the group size will be 256.

Normally the size of a local buffer will be some ratio of the group size. So if we needed 4 ints per work item in the group we might use a sequence such as.

-    final int[] buffer = new int[8192]; // this is global accessable to all work items.
-    final Range range = Range.create(buffer.length); // let the runtime pick the group size
-
-    Kernel k = new Kernel(){
-       @Local int[] localBuffer = new int[range.getLocalSize(0)*4]; // this is a local buffer containing 4 ints per work item in the group
-       public void run(){
-          // access buffer
-          // access localBuffer
-          localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
-          // ....
-       }
-    }
+```java
+
+final int[] buffer = new int[8192]; // this is global, accessible to all work items.
+final Range range = Range.create(buffer.length); // let the runtime pick the group size
+
+Kernel k = new Kernel(){
+    @Local int[] localBuffer = new int[range.getLocalSize(0)*4]; // this is a local buffer containing 4 ints per work item in the group
+    public void run(){
+        // access buffer
+        // access localBuffer
+        localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
+        // ....
+    }
+};
+```
+
Alternatively you can of course specify your own group size when you create the Range.

-    final int[] buffer = new int[8192]; // this is global accessable to all work items.
-    final Range range = Range.create(buffer.length,16); // we requested a group size of 16
-
-    Kernel k = new Kernel(){
-       @Local int[] localBuffer = new int[range.getLocalSize(0)*4]; // this is a local buffer containing 4 ints per work item in the group = 64 ints
-       public void run(){
-          // access buffer
-          // access localBuffer
-          localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
-          // ....
-       }
-    }
+```java
+
+final int[] buffer = new int[8192]; // this is global, accessible to all work items.
+final Range range = Range.create(buffer.length,16); // we requested a group size of 16
+
+Kernel k = new Kernel(){
+    @Local int[] localBuffer = new int[range.getLocalSize(0)*4]; // this is a local buffer containing 4 ints per work item in the group = 64 ints
+    public void run(){
+        // access buffer
+        // access localBuffer
+        localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
+        // ....
+    }
+};
+```
+
##Using barriers
+
As we mentioned above local memory buffers are shared by all work items/kernels executing in the same group. However, to read a value written by another workitem we need to insert a local barrier.

A common pattern involves having each work item copying a value from global memory into local memory.
-    Kernel k = new Kernel(){
-       @Local int[] localBuffer = new int[range.getLocalSize(0)];
-       public void run(){
+```java
+
+Kernel k = new Kernel(){
+    @Local int[] localBuffer = new int[range.getLocalSize(0)];
+    public void run(){

-          localBuffer[getLocalId(0)] = globalBuffer[getGlobalId(0)];
-          localBarrier(); // after this all kernels can see the data copied by other workitems in this group
-          // use localBuffer[0..getLocalSize(0)]
-       }
-    }
+        localBuffer[getLocalId(0)] = globalBuffer[getGlobalId(0)];
+        localBarrier(); // after this all kernels can see the data copied by other workitems in this group
+        // use localBuffer[0..getLocalSize(0)]
+    }
+};
+```
+
Without the barrier above, there is no guarantee that other work items will see mutations to localBuffer from other work items.

Caution regarding barriers

@@ -104,70 +131,90 @@ Barriers can be dangerous. It is up to the developer to ensure that all kernels

The following kernel will deadlock!

-    Kernel kernel = new Kernel(){
-       public void run(){
-          if (getGlobalId(0)>10){
-             // ...
-             localBarrier();
-             // ...
-          }
-       }
-    }
+```java
+
+Kernel kernel = new Kernel(){
+    public void run(){
+        if (getGlobalId(0)>10){
+            // ...
+            localBarrier();
+            // ...
+        }
+    }
+};
+```
+
We need to make sure that all kernels in a group execute the localBarrier(). So the following will work.

-    Kernel kernel = new Kernel(){
-       public void run(){
-          if (getGlobalId(0)>10){
-             // ...
-             localBarrier();
-             // ...
-          }else{
-             localBarrier();
-          }
-
-       }
-    }
+```java
+
+Kernel kernel = new Kernel(){
+    public void run(){
+        if (getGlobalId(0)>10){
+            // ...
+            localBarrier();
+            // ...
+        }else{
+            localBarrier();
+        }
+    }
+};
+```
+
Of course if we have multiple calls to localBarrier() in the 'if' side of the if..then, then we must match them in the 'else'.

-    Kernel kernel = new Kernel(){
-       public void run(){
-          if (getGlobalId(0)>10){
-             // ...
-             localBarrier();
-             // ...
-             localBarrier();
-             // ...
-          }else{
-             localBarrier();
-             localBarrier();
-          }
-
-       }
-    }
+```java
+
+Kernel kernel = new Kernel(){
+    public void run(){
+        if (getGlobalId(0)>10){
+            // ...
+            localBarrier();
+            // ...
+            localBarrier();
+            // ...
+        }else{
+            localBarrier();
+            localBarrier();
+        }
+    }
+};
+```
+
With loops we must make sure that each kernel processes any loop the same # of times. So the following is fine.

-    Kernel kernel = new Kernel(){
-       public void run(){
-          for (int i=0; i< 10; i++){
-             // ...
-             localBarrier();
-             // ...
-          }
-       }
-    }
+```java
+
+Kernel kernel = new Kernel(){
+    public void run(){
+        for (int i=0; i< 10; i++){
+            // ...
+            localBarrier();
+            // ...
+        }
+    }
+};
+```
+
However the following will deadlock

-    Kernel kernel = new Kernel(){
-       public void run(){
-          for (int i=0; i< getLocalId(0); i++){
-             // ...
-             localBarrier();
-             // ...
-          }
-       }
-    }
+```java
+
+Kernel kernel = new Kernel(){
+    public void run(){
+        for (int i=0; i< getLocalId(0); i++){
+            // ...
+            localBarrier();
+            // ...
+        }
+    }
+};
+```
+
As a testament to how well we emulate OpenCL in JTP mode, this will also deadlock your kernel in JTP mode ;) so be careful.

Performance impact in JTP mode

@@ -178,6 +225,5 @@ We do honor localBarrier() using Java's barrier from the new concurrency utils.

I would recommend using local memory and barriers only if I am 90% sure the code will run on the GPU.

##Can I see some code?
-I added a version of NBody example which uses local memory, the source can be found here.
-[http://code.google.com/p/aparapi/source/browse/trunk/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java](http://code.google.com/p/aparapi/source/browse/trunk/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java)
\ No newline at end of file
+There is a version of NBody example which uses local memory; the source can be [found here](https://github.com/Syncleus/aparapi-examples/blob/master/src/main/java/com/aparapi/examples/nbody/Local.java).
\ No newline at end of file
diff --git a/source/documentation/UsingMultiDimExecutionRanges.html.md b/source/documentation/multiple-dim-ranges.html.md
similarity index 58%
rename from source/documentation/UsingMultiDimExecutionRanges.html.md
rename to source/documentation/multiple-dim-ranges.html.md
index 7e5ec2a94326a6324db972e0fbc1237b06fee568..067fbf2d919cfe4123c45c30b28808d6873d9940 100644
--- a/source/documentation/UsingMultiDimExecutionRanges.html.md
+++ b/source/documentation/multiple-dim-ranges.html.md
@@ -1,9 +1,8 @@
---
- title: UsingMultiDimExecutionRanges
+title: Multiple Dim Ranges
+description: How to use the Range class (for multi-dim range access).
---

-*How to use the new Range class (for multi-dim range access) Updated Feb 13, 2012 by frost.g...@gmail.com*
-
Aparapi now allows developers to execute over one, two or three dimensional ranges. OpenCL natively allows the user to execute over 1, 2 or 3 dimension grids via the clEnqueueNDRangeKernel() method. Initially we chose not to expose 2D or 3D ranges (Aparapi's Kernel.execute(range) allowed only 1D ranges), but following a specific request we added the notion of a Range via the new com.aparapi.Range class.

@@ -25,39 +24,46 @@ The above represents a 2D grid of execution 32 rows by 32 columns. In this case

If we wish to specify the groupsize (say 4x4) then we can use.

-    Range range = Range.create2D(32, 32, 4, 4);
-    This example uses a 2D range to apply a blurring convolution effect to a pixel buffer.
-
-    final static int WIDTH=128;
-    final static int HEIGHT=64;
-    final int in[] = new int[WIDTH*HEIGHT];
-    final int out[] = new int[WIDTH*HEIGHT];
-    Kernel kernel = new Kernel(){
-       public void run(){
-          int x = getGlobalId(0);
-          int y = getGlobalId(1);
-          if (x>0 && x<(getGlobalSize(0)-1) && y>0 && y<(getGlobalSize(0)-1)){
-             int sum = 0;
-             for (int dx =-1; dx<2; dx++){
-                for (int dy =-1; dy<2; dy++){
-                   sum+=in[(y+dy)*getGlobalSize(0)+(x+dx)];
-                }
-             }
-             out[y*getGlobalSize(0)+x] = sum/9;
-          }
-       }
-
-    };
-    Range range = Range.create2D(WIDTH, HEIGHT);
-    kernel.execute(range);
+```java
+
+Range range = Range.create2D(32, 32, 4, 4);
+```
+
+This example uses a 2D range to apply a blurring convolution effect to a pixel buffer.
+
+```java
+
+final static int WIDTH=128;
+final static int HEIGHT=64;
+final int in[] = new int[WIDTH*HEIGHT];
+final int out[] = new int[WIDTH*HEIGHT];
+Kernel kernel = new Kernel(){
+    public void run(){
+        int x = getGlobalId(0);
+        int y = getGlobalId(1);
+        if (x>0 && x<(getGlobalSize(0)-1) && y>0 && y<(getGlobalSize(1)-1)){
+            int sum = 0;
+            for (int dx =-1; dx<2; dx++){
+                for (int dy =-1; dy<2; dy++){
+                    sum+=in[(y+dy)*getGlobalSize(0)+(x+dx)];
+                }
+            }
+            out[y*getGlobalSize(0)+x] = sum/9;
+        }
+    }
+
+};
+Range range = Range.create2D(WIDTH, HEIGHT);
+kernel.execute(range);
+```

##Handling this from JTP mode
+
Mapping to OpenCL for this is all fairly straightforward. In Java JTP mode we have to emulate the execution over the 1D, 2D and 3D ranges using threads. Note that the number of threads we launch is essentially the size of the group. So be careful creating large groups.
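To make that concrete, here is a minimal sketch (not part of the original document) of how the JTP thread count follows from the requested group size; it uses only the `Range.create2D(...)` factory and the `getLocalSize(n)` accessors shown elsewhere on this page:

```java

// A sketch: in JTP mode the number of emulation threads equals the work-group size.
Range range2D = Range.create2D(1024, 1024, 16, 16); // 1024x1024 grid, 16x16 groups
int jtpThreads = range2D.getLocalSize(0) * range2D.getLocalSize(1); // 16 * 16 = 256 threads
System.out.println("JTP mode will emulate this range with " + jtpThreads + " threads");
```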
If we ask for a 3D range using :-

-    Range range = Range.create3D(1024, 1024, 1024, 8, 8, 8);
+```java
+
+Range range = Range.create3D(1024, 1024, 1024, 8, 8, 8);
+```

-We are asking for a group size of 8x8x8 == 512. So we are asking for 512 threads!
\ No newline at end of file
+We are asking for a group size of `8x8x8 == 512`. So we are asking for 512 threads!
\ No newline at end of file
diff --git a/source/documentation/NewFeatures.html.md b/source/documentation/new-features.html.md
similarity index 58%
rename from source/documentation/NewFeatures.html.md
rename to source/documentation/new-features.html.md
index 8c6f63f0fd0e65bd7a545cb45da37e340ee30202..0f66ec9b5374f5e2c084b9622434d89dc2b90fc8 100644
--- a/source/documentation/NewFeatures.html.md
+++ b/source/documentation/new-features.html.md
@@ -1,30 +1,38 @@
---
- title: NewFeatures
+title: New Features
+description: New Features recently added to Aparapi.
---

-*New Features added to this open source release of Aparapi. Updated Sep 14, 2011 by frost.g...@gmail.com*
##New Features
+
Aparapi has two new, especially useful features:

Explicit Buffer Management for minimizing buffer transfers

Kernel access to objects held in arrays
+
###Minimizing Buffer Transfers
+
####Explicit Buffer Management
+
Aparapi is designed to shield the Java developer from dealing with the underlying movement of data between the OpenCL host and device. Aparapi can analyze a kernel's run() method and run-reachable methods to determine which primitive arrays to transfer to the GPU prior to execution, and which arrays to transfer back when the GPU execution is complete.

Generally this strategy is both clean and performant. Aparapi will attempt to just do the right thing. However, occasionally the following code pattern is seen.

-    final int[] hugeArray = new int[HUGE];
-    final int[] done = new int[]{0};
-    Kernel kernel= new Kernel(){
-       ... // reads/writes hugeArray and writes to done[0] when complete
-    };
-    done[0]=0;
-    while (done[0] ==0)){
-       kernel.execute(HUGE);
-    }
+```java
+
+final int[] hugeArray = new int[HUGE];
+final int[] done = new int[]{0};
+Kernel kernel= new Kernel(){
+    ... // reads/writes hugeArray and writes to done[0] when complete
+};
+done[0]=0;
+while (done[0] == 0){
+    kernel.execute(HUGE);
+}
+```
+
This is a common pattern in reduce stages of map-reduce type problems. Essentially the developer wants to keep executing a kernel until some condition is met. For example, this may be seen in bitonic sort implementations and various financial applications.

From the code it can be seen that the kernel reads and writes the hugeArray[] array and uses the single item done[] array to indicate some form of convergence or completion.

@@ -33,19 +41,25 @@ Unfortunately, by default Aparapi will transfer done[] and hugeArray[] to and fr

To demonstrate which buffers are being transferred, these copies are shown as comments in the following version of the code.

-    final int[] hugeArray = new int[HUGE];
-    final int[] done = new int[]{0};
-    Kernel kernel= new Kernel(){
-       ... // reads/writes hugeArray and writes to done[0] when complete
-    };
-    done[0]=0;
-    while (done[0] ==0)){
-       // Send done[] to GPU
-       // Send hugeArray[] to GPU
-       kernel.execute(HUGE);
-       // Fetch done[] from GPU
-       // Fetch hugeArray[] from GPU
-    }
+```java
+
+final int[] hugeArray = new int[HUGE];
+final int[] done = new int[]{0};
+Kernel kernel= new Kernel(){
+    ... // reads/writes hugeArray and writes to done[0] when complete
+};
+done[0]=0;
+while (done[0] == 0){
+    // Send done[] to GPU
+    // Send hugeArray[] to GPU
+    kernel.execute(HUGE);
+    // Fetch done[] from GPU
+    // Fetch hugeArray[] from GPU
+}
+```
+
+
Further analysis of the code reveals that hugeArray[] is not accessed by the loop containing the kernel execution, so Aparapi is performing 999 unnecessary transfers to the device and 999 unnecessary transfers back. Only two transfers of hugeArray[] are needed; one to move the initial data to the GPU and one to move it back after the loop terminates.

The done[] array is accessed during each iteration (although never written to within the loop), so it does need to be transferred back for each return from Kernel.execute(), however, it only needs to be sent once.

@@ -58,131 +72,164 @@ To use this feature first set the mode to explicit, using the kernel.setExplicit

The following code illustrates the use of these new explicit buffer management APIs.

-    final int[] hugeArray = new int[HUGE];
-    final int[] done = new int[]{0};
-    Kernel kernel= new Kernel(){
-       ... // reads/writes hugeArray and writes to done[0] when complete
-    };
-    kernel.setExplicit(true);
-    done[0]=0;
-    kernel.put(done);
-    kernel.put(hugeArray);
-    while (done[0] ==0)){
-       kernel.execute(HUGE);
-       kernel.get(done);
-    }
-    kernel.get(hugeArray);
+```java
+
+final int[] hugeArray = new int[HUGE];
+final int[] done = new int[]{0};
+Kernel kernel= new Kernel(){
+    ... // reads/writes hugeArray and writes to done[0] when complete
+};
+kernel.setExplicit(true);
+done[0]=0;
+kernel.put(done);
+kernel.put(hugeArray);
+while (done[0] == 0){
+    kernel.execute(HUGE);
+    kernel.get(done);
+}
+kernel.get(hugeArray);
+```
+
Note that marking a kernel as explicit and failing to request the appropriate transfer is a programmer error. We deliberately made Kernel.put(…), Kernel.get(…) and Kernel.execute(range) return an instance of the executing kernel to allow these calls to be chained. Some may find this fluent style API more expressive.

-    final int[] hugeArray = new int[HUGE];
-    final int[] done = new int[]{0};
-    Kernel kernel= new Kernel(){
-       ... // reads/writes hugeArray and writes to done[0] when complete
-    };
-    kernel.setExplicit(true);
-    done[0]=0;
-    kernel.put(done).put(hugeArray); // chained puts
-    while (done[0] ==0)){
-       kernel.execute(HUGE).get(done); // chained execute and put
-    }
-    kernel.get(hugeArray);
+```java
+
+final int[] hugeArray = new int[HUGE];
+final int[] done = new int[]{0};
+Kernel kernel= new Kernel(){
+    ... // reads/writes hugeArray and writes to done[0] when complete
+};
+kernel.setExplicit(true);
+done[0]=0;
+kernel.put(done).put(hugeArray); // chained puts
+while (done[0] == 0){
+    kernel.execute(HUGE).get(done); // chained execute and get
+}
+kernel.get(hugeArray);
+```
+
####An alternate approach for loops containing a single kernel.execute(range) call.
+
One variant of code which would normally suggest the use of Explicit Buffer Management can be handled differently. For cases where Kernel.execute(range) is the sole statement inside a loop and where the iteration count is known prior to the first iteration we offer an alternate (hopefully more elegant) way of minimizing buffer transfers. So for cases like:-

-    final int[] hugeArray = new int[HUGE];
-    Kernel kernel= new Kernel(){
-       ... // reads/writes hugeArray
-    };
+```java

-    for (int pass=0; pass<1000; pass++){
-       kernel.execute(HUGE);
-    }
+final int[] hugeArray = new int[HUGE];
+Kernel kernel= new Kernel(){
+    ... // reads/writes hugeArray
+};
+
+for (int pass=0; pass<1000; pass++){
+    kernel.execute(HUGE);
+}
+```
+
The developer can request that Aparapi perform the outer loop rather than coding the loop. This is achieved explicitly by passing the iteration count as the second argument to Kernel.execute(range, iterations). Now any form of code that looks like :-

-    int range=1024;
-    int loopCount=64;
-    for (int passId=0; passId<loopCount; passId++){
-       kernel.execute(range);
-    }
+```java
+
+int range=1024;
+int loopCount=64;
+for (int passId=0; passId<loopCount; passId++){
+    kernel.execute(range);
+}
+```
+
Can be replaced with

-    int range=1024;
-    int loopCount=64;
+```java

-    kernel.execute(range, loopCount);
+int range=1024;
+int loopCount=64;
+
+kernel.execute(range, loopCount);
+```
+
Not only does this make the code more compact and avoid the use of explicit buffer management APIs, it also gives Aparapi visibility of the complete loop so that Aparapi can minimize the number of transfers. Aparapi will only transfer buffers to the GPU once and transfer them back once, resulting in improved performance.

Sometimes kernel code using this loop-pattern needs to track the current iteration number as the code passes through the outer loop. Previously we would be forced to use explicit buffer management to allow the kernel to do this.

The code for this would have looked something like

-    int range=1024;
-    int loopCount=64;
-    final int[] hugeArray = new int[HUGE];
-    final int[] passId = new int[0];
-    Kernel kernel= new Kernel(){
-       @Override public void run(){
-          int id=getGlobalId();
-          if (passId[0] == 0){
-             // perform some initialization!
-          }
-          ... // reads/writes hugeArray
-       }
-    };
-    Kernel.setExplicit(true);
-    kernel.put(hugeArray);
-    for (passId[0]=0; passId[0]<loopCount; passId[0]++){
-
-       kernel.put(passId).execute(range);
-    }
+```java
+
+int range=1024;
+int loopCount=64;
+final int[] hugeArray = new int[HUGE];
+final int[] passId = new int[]{0};
+Kernel kernel= new Kernel(){
+    @Override public void run(){
+        int id=getGlobalId();
+        if (passId[0] == 0){
+            // perform some initialization!
+        }
+        ... // reads/writes hugeArray
+    }
+};
+kernel.setExplicit(true);
+kernel.put(hugeArray);
+for (passId[0]=0; passId[0]<loopCount; passId[0]++){
+    kernel.put(passId).execute(range);
+}
+```
+
In the current version of Aparapi we added Kernel.getPassId() to allow a Kernel to determine the current ‘pass’ through the outer loop without having to use explicit buffer management. So the previous code can now be written without any explicit buffer management APIs:-

-    final int[] hugeArray = new int[HUGE];
-    final int[] pass[] = new int[]{0};
-    Kernel kernel= new Kernel(){
-       @Override public void run(){
-          int id=getGlobalId();
-          int pass = getPassId();
-          if (pass == 0){
-             // perform some initialization!
-          }
-          ... // reads/writes both hugeArray
-       }
-    };
-
-    kernel.execute(HUGE, 1000);
+```java
+
+final int[] hugeArray = new int[HUGE];
+Kernel kernel= new Kernel(){
+    @Override public void run(){
+        int id=getGlobalId();
+        int pass = getPassId();
+        if (pass == 0){
+            // perform some initialization!
+        }
+        ... // reads/writes hugeArray
+    }
+};
+
+kernel.execute(HUGE, 1000);
+```
+
One common use for Kernel.getPassId() is to avoid flipping buffers in the outer loop. It is common for kernels to process data from one buffer to another, and in the next invocation process the data back the other way. Now these kernels can use the passId (odd or even) to determine the direction of data transfer.
-    final int[] arr1 = new int[HUGE];
-    final int[] arr2 = new int[HUGE];
-    Kernel kernel= new Kernel(){
-       int f(int v){ … }
+```java
+
+final int[] arr1 = new int[HUGE];
+final int[] arr2 = new int[HUGE];
+Kernel kernel= new Kernel(){
+    int f(int v){ … }

-       @Override public void run(){
-          int id=getGlobalId();
-          int pass = getPassId();
-          if (pass%2==0){
-             arr1[id] = f(arr2[id]);
-          }else{
-             arr2[id] = f(arr1[id]);
+    @Override public void run(){
+        int id=getGlobalId();
+        int pass = getPassId();
+        if (pass%2==0){
+            arr1[id] = f(arr2[id]);
+        }else{
+            arr2[id] = f(arr1[id]);

-          }
-       }
-    };
+        }
+    }
+};

-    kernel.execute(HUGE, 1000);
+
+kernel.execute(HUGE, 1000);
+```

####Allow kernels to access simple arrays of objects
+
Aparapi needs to create OpenCL from the bytecode that it sees. Generally OpenCL constrains us to using parallel primitive arrays (OpenCL allows structs, but Java and OpenCL do not have comparable memory layouts for these structures). Therefore, you will generally need to refactor your code from a classic object-oriented form to use primitive arrays.

This incompatibility between data-parallel and object-oriented code patterns might discourage use of Aparapi, so Aparapi includes limited support for arrays of simple Objects. Future versions may well extend this functionality and address performance loss.

@@ -191,40 +238,42 @@ Consider the NBody example.

Typically, a Java developer writing NBody would probably not separate the x,y and z ordinates into parallel arrays of floats as was required in the previous (alpha) version of Aparapi. Instead, a Java developer would probably create a Body class to hold the state of each body and possibly a Universe class (container of Body instances) with the responsibility for positioning and possibly displaying the bodies.

-    class Body{
-       float x,y,z;
-       float getX(){return x;}
-       void setX(float _x){ x = _x;}
-       float getY(){return y;}
-       void setY(float _y){ y = _y;}
-       float getZ(){return z;}
-       void setZ(float _z){ z = _z;}
-
-
-       // other data related to Body unused by positioning calculations
-    }
-
-    class Universe{
-       final Body[] bodies;
-       public Universe(final Body[] _bodies){
-          bodies = _bodies;
-       }
-       void adjustPositions(){
-          for (Body outer:bodies){
-             for (Body inner:bodies}{
-                // adjust outer position to reflect the effect of inner
-                // using inner and outer getters and setters for x, y and z
-             }
-          }
-       }
-       void display(){
-          for (Body body:bodies){
-             // draw body based on x, y and z using Body getters
+```java
+
+class Body{
+    float x,y,z;
+    float getX(){return x;}
+    void setX(float _x){ x = _x;}
+    float getY(){return y;}
+    void setY(float _y){ y = _y;}
+    float getZ(){return z;}
+    void setZ(float _z){ z = _z;}
+
+
+    // other data related to Body unused by positioning calculations
+}
+
+class Universe{
+    final Body[] bodies;
+    public Universe(final Body[] _bodies){
+        bodies = _bodies;
+    }
+    void adjustPositions(){
+        for (Body outer:bodies){
+            for (Body inner:bodies){
+                // adjust outer position to reflect the effect of inner
+                // using inner and outer getters and setters for x, y and z
+            }
+        }
+    }
-    }
-From the above code we see that the Universe.adjustPositions() method is compute intensive and an ideal candidate for refactoring to use Aparapi. The current version of Aparapi is able to deal with simple arrays of objects like this.
-
-Now when Aparapi encounters an array of objects and the accesses to these objects are constrained to simple getters and setters, Aparapi will automatically extract the values of the accessed fields into a data parallel form, execute the kernel and then replace the results back in the original objects in the array. This happens on each call to Kernel.execute() and is fairly costly (from a performance point of view), however, for embarrassingly parallel code (such as NBody), we can still show considerable performance gains over standard Java Thread Pool
-
-Attribution
\ No newline at end of file
+    }
+    void display(){
+        for (Body body:bodies){
+            // draw body based on x, y and z using Body getters
+        }
+    }
+}
+```
+
+From the above code we see that the `Universe.adjustPositions()` method is compute intensive and an ideal candidate for refactoring to use Aparapi. The current version of Aparapi is able to deal with simple arrays of objects like this.
+
+Now when Aparapi encounters an array of objects and the accesses to these objects are constrained to simple getters and setters, Aparapi will automatically extract the values of the accessed fields into a data parallel form, execute the kernel and then replace the results back in the original objects in the array. This happens on each call to Kernel.execute() and is fairly costly (from a performance point of view), however, for embarrassingly parallel code (such as NBody), we can still show considerable performance gains over a standard Java Thread Pool.
\ No newline at end of file
diff --git a/source/documentation/opencl-bindings.html.md b/source/documentation/opencl-bindings.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..4568f566f1281d163563d89ec2eccf2c5c07daba
--- /dev/null
+++ b/source/documentation/opencl-bindings.html.md
@@ -0,0 +1,76 @@
+---
+title: OpenCL Bindings
+description: How to use new OpenCL binding mechanism.
+---
+
+As a step towards the extension mechanism I needed a way to easily bind OpenCL to an interface.
+
+Here is what I have come up with. We will use the 'Square' example.
+
+You first define an interface with OpenCL annotations:
+
+```java
+
+interface Squarer extends OpenCL<Squarer>{
+    @Kernel("{\n"//
+        + "  const size_t id = get_global_id(0);\n"//
+        + "  out[id] = in[id]*in[id];\n"//
+        + "}\n")//
+    public Squarer square(//
+        Range _range,//
+        @GlobalReadOnly("in") float[] in,//
+        @GlobalWriteOnly("out") float[] out);
+}
+```
+
+This describes the API we wish to bind to a set of kernel entrypoints (here we only have one, but we could have many). Then you 'realize' the interface by asking a device to create an implementation of the interface. Device is a new Aparapi class which represents a GPU or CPU OpenCL device. So here we are asking for the first (default) GPU device to realize the interface.
+
+```java
+
+Squarer squarer = Device.firstGPU(Squarer.class);
+```
+
+Now you can call the implementation directly with a Range.
+
+```java
+
+squarer.square(Range.create(in.length), in, out);
+```
+
+I think that we will have the easiest OpenCL binding out there...
+
+Following some conversations/suggestions online http://a-hackers-craic.blogspot.com/2012/03/aparapi.html we could also offer the ability to provide the OpenCL source from a file/url using interface level Annotations.
+
+So we could allow:
+```java
+
+@OpenCL.Resource("squarer.cl")
+interface Squarer extends OpenCL<Squarer>{
+    public Squarer square(//
+        Range _range,//
+        @GlobalReadOnly("in") float[] in,//
+        @GlobalWriteOnly("out") float[] out);
+}
+```
+
+Or if the text is on-hand at compile time in a single constant string:
+
+```java
+
+@OpenCL.Source("... opencl text here")
+interface Squarer extends OpenCL<Squarer>{
+    public Squarer square(//
+        Range _range,//
+        @GlobalReadOnly("in") float[] in,//
+        @GlobalWriteOnly("out") float[] out);
+}
+```
+
+Finally, to allow for creation of dynamic OpenCL (good for FFTs of various radii):
+
+```java
+
+String openclSource = ...;
+Squarer squarer = Device.firstGPU(Squarer.class, openclSource);
+```
\ No newline at end of file
diff --git a/source/documentation/PrivateMemorySpace.html.md b/source/documentation/private-memory-space.html.md
similarity index 92%
rename from source/documentation/PrivateMemorySpace.html.md
rename to source/documentation/private-memory-space.html.md
index 8ae4fc79856cd470cea9047c70ea6d7b28d2d46d..9ef9142d5d69b3106cdba7e64dbabea3963a98da 100644
--- a/source/documentation/PrivateMemorySpace.html.md
+++ b/source/documentation/private-memory-space.html.md
@@ -1,9 +1,8 @@
---
- title: PrivateMemorySpace
+title: Private Memory Space
+description: Using private memory space in Aparapi kernels.
---

-*Using `__private` memory space in Aparapi kernels. Phase-Implementation Updated Sep 14, 2014 by barneydp...@gmail.com*
-
## Introduction

The private memory space identifier (just "private" is also recognised) can be applied to struct fields in order to indicate that the data is not shared with/accessible to other kernel instances. Whilst this is the default for non-array data, it must be explicitly applied to array fields in order to make them private. Aparapi now supports arrays in the private memory space.

@@ -12,10 +11,18 @@ The private memory space is generally only suitable for smallish arrays, but is

##Details

In Aparapi there are two mechanisms available to mark a Kernel class member as belonging to the private memory space when mapped to OpenCL code (matching the equivalent functionality for marking items as belonging to the local memory space). Either the field can be named with a suffix plus buffer size, for example

-    protected short[] myBuffer_$private$32 = new short[32];
+```java
+
+protected short[] myBuffer_$private$32 = new short[32];
+```
+
or using the Annotation Kernel.PrivateMemorySpace, for example

-    protected @PrivateMemorySpace(32) short[] myBuffer = new short[32];
+```java
+
+protected @PrivateMemorySpace(32) short[] myBuffer = new short[32];
+```
+
The latter should be used in preference to the former. Note that OpenCL requires that the size of a private array be fixed at compile time for any kernel. Thus it is not possible for a single Kernel subclass to support private buffers of varying size. Unfortunately this may entail creating multiple subclasses with varying buffer sizes in order to most efficiently support varying private buffer sizes.
diff --git a/source/documentation/profiling-the-kernel.html.md b/source/documentation/profiling-the-kernel.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..5fd6c22a1e128ab40ce61e04b1ea4d83234fc443
--- /dev/null
+++ b/source/documentation/profiling-the-kernel.html.md
@@ -0,0 +1,66 @@
+---
+title: Profiling the Kernel
+description: Using Aparapi's built in profiling APIs.
+---
+
+If you want to extract OpenCL performance info from a kernel at runtime you need to set the property :-
+
+```
+
+-Dcom.aparapi.enableProfiling=true
+```
+
+Your application can then call kernel.getProfileInfo() after a successful call to kernel.execute(range) to extract a List<ProfileInfo>.
+
+Each ProfileInfo holds timing information for buffer writes, kernel executions and buffer reads.
+
+The following code will print a simple table of profile information:
+
+```java
+
+List<ProfileInfo> profileInfo = k.getProfileInfo();
+for (final ProfileInfo p : profileInfo) {
+    System.out.print(" " + p.getType() + " " + p.getLabel() + " " + (p.getStart() / 1000) + " .. "
+        + (p.getEnd() / 1000) + " " + ((p.getEnd() - p.getStart()) / 1000) + "us");
+    System.out.println();
+}
+```
+
+Here is an example implementation:
+
+```java
+
+final float result[] = new float[2048*2048];
+Kernel k = new Kernel(){
+    public void run(){
+        final int gid=getGlobalId();
+        result[gid] =0f;
+    }
+};
+k.execute(result.length);
+List<ProfileInfo> profileInfo = k.getProfileInfo();
+
+for (final ProfileInfo p : profileInfo) {
+    System.out.print(" " + p.getType() + " " + p.getLabel() + " " + (p.getStart() / 1000) + " .. "
+        + (p.getEnd() / 1000) + " " + ((p.getEnd() - p.getStart()) / 1000) + "us");
+    System.out.println();
+}
+k.dispose();
+```
+
+And here is the tabular output from running:
+
+```
+
+java
+    -Djava.library.path=${APARAPI_HOME}
+    -Dcom.aparapi.enableProfiling=true
+    -cp ${APARAPI_HOME}:.
+    MyClass
+
+W val$result 69500 .. 72694 3194us
+X exec() 72694 .. 72835 141us
+R val$result 75327 .. 78225 2898us
+```
+
+The table shows that the transfer of the 'result' buffer to the device ('W') took 3194 us (micro seconds), the execute ('X') of the kernel 141 us and the read ('R') of the resulting buffer 2898 us.
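If a per-type summary is more useful than the raw table, the same accessors can be aggregated. The following is a minimal sketch (not from the original document; it assumes only the `ProfileInfo` getters used above plus `java.util.Map`/`java.util.HashMap` imports):

```java

// A sketch: sum the elapsed microseconds per operation type (W=write, X=execute, R=read).
Map<String, Long> totals = new HashMap<>();
for (final ProfileInfo p : k.getProfileInfo()) {
    long elapsedUs = (p.getEnd() - p.getStart()) / 1000; // same unit conversion as the table above
    totals.merge(String.valueOf(p.getType()), elapsedUs, Long::sum);
}
totals.forEach((type, us) -> System.out.println(type + " total: " + us + "us"));
```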
\ No newline at end of file
diff --git a/source/documentation/SettingUpLinuxHSAMachineForAparapi.html.md b/source/documentation/setting-up-hsa.html.md
similarity index 71%
rename from source/documentation/SettingUpLinuxHSAMachineForAparapi.html.md
rename to source/documentation/setting-up-hsa.html.md
index 162dc0e6b730140c0e6fa4c54f86932f0d001142..3e58897254c5aff9d45aa83f32fc50967272d697 100644
--- a/source/documentation/SettingUpLinuxHSAMachineForAparapi.html.md
+++ b/source/documentation/setting-up-hsa.html.md
@@ -1,9 +1,8 @@
---
- title: SettingUpLinuxHSAMachineForAparapi
+title: Setting Up HSA
+description: How to setup a HSA machine for testing HSA enabled Aparapi
---

-*How to setup a Linux HSA machine for testing HSA enabled Aparapi Updated May 22, 2014 by frost.g...@gmail.com*
-
* HSA Videos
    * [http://www.youtube.com/watch?v=5ntILiXTuhE](http://www.youtube.com/watch?v=5ntILiXTuhE)
    * [http://www.youtube.com/watch?v=caEPq4KvTTA](http://www.youtube.com/watch?v=caEPq4KvTTA)

@@ -68,72 +67,102 @@ Until all of the HSA drivers and features are available in stock linux and have

##A Ubuntu compatible kernel can be pulled from github

-    $ cd ~ # I put all of this in my home dir
-    $ sudo apt-get install git
-    $ git clone https://github.com/HSAFoundation/Linux-HSA-Drivers-And-Images-AMD.git
+```bash
+
+$ cd ~ # I put all of this in my home dir
+$ sudo apt-get install git
+$ git clone https://github.com/HSAFoundation/Linux-HSA-Drivers-And-Images-AMD.git
+```
+
Or you can pull the zip and unzip using curl if you don't have git

-    $ cd ~ # I put all of this in my home dir
-    $ curl -L https://github.com/HSAFoundation/Linux-HSA-Drivers-And-Images-AMD/archive/master.zip > drivers.zip
-    $ unzip drivers.zip
+```bash
+
+$ cd ~ # I put all of this in my home dir
+$ curl -L https://github.com/HSAFoundation/Linux-HSA-Drivers-And-Images-AMD/archive/master.zip > drivers.zip
+$ unzip drivers.zip
+```
+
This will create the following subdir on your machine

-    Linux-HSA-Drivers-And-Images-AMD/
-       LICENSE
-       README.md
-       ubuntu12.10-based-alpha1/
-          xorg.conf
-          linux-image-3.13.0-kfd+_3.13.0-kfd+-2_amd64.deb
+```
+
+Linux-HSA-Drivers-And-Images-AMD/
+    LICENSE
+    README.md
+    ubuntu12.10-based-alpha1/
+        xorg.conf
+        linux-image-3.13.0-kfd+_3.13.0-kfd+-2_amd64.deb
+```

From here we can install our new image and setup the HSA KFD (the driver for HSA) and reboot to the new kernel.

-    $ cd ~/Linux-HSA-Drivers-And-Images-AMD
-    $ echo "KERNEL==\"kfd\", MODE=\"0666\"" | sudo tee /etc/udev/rules.d/kfd.rules
-    $ sudo dpkg -i ubuntu13.10-based-alpha1/linux-image-3.13.0-kfd+_3.13.0-kfd+-2_amd64.deb
-    $ sudo cp ~/Linux-HSA-Drivers-And-Images-AMD/ubuntu13.10-based-alpha1/xorg.conf /etc/X11
-    $ sudo reboot
+```bash
+
+$ cd ~/Linux-HSA-Drivers-And-Images-AMD
+$ echo "KERNEL==\"kfd\", MODE=\"0666\"" | sudo tee /etc/udev/rules.d/kfd.rules
+$ sudo dpkg -i ubuntu13.10-based-alpha1/linux-image-3.13.0-kfd+_3.13.0-kfd+-2_amd64.deb
+$ sudo cp ~/Linux-HSA-Drivers-And-Images-AMD/ubuntu13.10-based-alpha1/xorg.conf /etc/X11
+$ sudo reboot
+```
+
##Installing OKRA RT

Now we need a runtime for executing HSAIL code. We share common infrastructure used by our sister OpenJDK project called Sumatra. Both Aparapi and Sumatra use OKRA to execute HSAIL code on a HSA enabled platform.

We can get the latest version of OKRA (Offloadable Kernel Runtime API) from another HSA foundation repository.
-    $ cd ~ # I put all of this in my home dir
-    $ git clone https://github.com/HSAFoundation/Okra-Interface-to-HSA-Device.git
+```bash
+
+$ cd ~ # I put all of this in my home dir
+$ git clone https://github.com/HSAFoundation/Okra-Interface-to-HSA-Device.git
+```
+
or if you prefer curl/unzip

-    $ cd ~ # I put all of this in my home dir
-    $ curl -L https://github.com/HSAFoundation/Okra-Interface-to-HSA-Device/archive/master.zip > okra.zip
-    $ unzip okra.zip
+```bash
+
+$ cd ~ # I put all of this in my home dir
+$ curl -L https://github.com/HSAFoundation/Okra-Interface-to-HSA-Device/archive/master.zip > okra.zip
+$ unzip okra.zip
+```
+
This will create the following dir structure.

-    Okra-Interface-to-HSA-Device/
-       README.md
-       okra/
-          README
-          dist/
-             okra.jar
-          bin/
-             libamdhsacl64.so
-             libnewhsacore64.so
-             libokra_x86_64.so
-          include/
-             common.h
-             okraContext.h
-
-          samples/
-             dist/
-                Squares
-                Squares.hsail
-                runSquares.sh
+```
+
+Okra-Interface-to-HSA-Device/
+    README.md
+    okra/
+        README
+        dist/
+            okra.jar
+        bin/
+            libamdhsacl64.so
+            libnewhsacore64.so
+            libokra_x86_64.so
+        include/
+            common.h
+            okraContext.h
+
+        samples/
+            dist/
+                Squares
+                Squares.hsail
+                runSquares.sh
+```

OKRA offers a C API (for those that are so inclined ;) ) as well as a java jar file which contains JNI wrappers.

##Sanity check your HSA and OKRA install

So to sanity check your install you can run a small sample app (binary)

-    $ cd ~/Okra-Interface-to-HSA-Device/okra/samples/
-    $ sh runSquares.sh
+```bash
+
+$ cd ~/Okra-Interface-to-HSA-Device/okra/samples/
+$ sh runSquares.sh
+```
+
If everything is OK this should run the C Squares test app.

Congratulations, you have executed your first HSA enabled app.

@@ -145,45 +174,58 @@ My recommendation is to download AMD-APP-SDK-v2.9-lnx64.tgz from [http://develop

Note that we have nested zipped jars in this archive.

-    $ cd ~
-    $ gunzip ~/Downloads/AMD-APP-SDK-v2.9-lnx64.tgz
-    $ tar xvf ~/Downloads/AMD-APP-SDK-v2.9-lnx64.tar
-    $ rm ~/default-install_lnx_64.pl ~/icd-registration.tgz ~/Install-AMD-APP.sh ~/ReadMe.txt
-    $ gunzip ~/AMD-APP-SDK-v2.9-RC-lnx64.tgz
-    $ tar xvf ~/AMD-APP-SDK-v2.9-RC-lnx64.tar
-    $ rm ~/AMD-APP-SDK-v2.9-RC-lnx64.tar
-    $ rm -rf AMD-APP-SDK-v2.9-RC-lnx64/samples
+```bash
+
+$ cd ~
+$ gunzip ~/Downloads/AMD-APP-SDK-v2.9-lnx64.tgz
+$ tar xvf ~/Downloads/AMD-APP-SDK-v2.9-lnx64.tar
+$ rm ~/default-install_lnx_64.pl ~/icd-registration.tgz ~/Install-AMD-APP.sh ~/ReadMe.txt
+$ gunzip ~/AMD-APP-SDK-v2.9-RC-lnx64.tgz
+$ tar xvf ~/AMD-APP-SDK-v2.9-RC-lnx64.tar
+$ rm ~/AMD-APP-SDK-v2.9-RC-lnx64.tar
+$ rm -rf AMD-APP-SDK-v2.9-RC-lnx64/samples
+```
+
Note where AMD-APP-SDK-v2.9-RC-lnx64 is located, you need this in the following step.

##You will need Java 8

Download Java 8 JDK from [https://jdk8.java.net/download.html](https://jdk8.java.net/download.html) I chose to download the zipped tar and not install with RPM so I can control the location of the install.

-    $ cd ~
-    $ gunzip /home/gfrost/Downloads/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
-    $ tar xvf ~/Downloads/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar
+```bash
+
+$ cd ~
+$ gunzip /home/gfrost/Downloads/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
+$ tar xvf ~/Downloads/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar
+```
+
I now have ~/jdk1.8.0 as my java 8 install dir.
Alternatively the following will pull from Oracle's site using curl

-    $ cd ~
-    $ curl http://download.java.net/jdk8/archive/b132/binaries/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz?q=download/jdk8/archive/b132/binaries/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz > jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
-    $ gunzip jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
-    $ tar xvf jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar
-I now have ~/jdk1.8.0 as my java 8 install dir.
+```bash

-##You will need ant
-    $ sudo apt-get install ant
-This takes a long time because in also installs a java7 jdk.
+$ cd ~
+$ curl http://download.java.net/jdk8/archive/b132/binaries/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz?q=download/jdk8/archive/b132/binaries/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz > jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
+$ gunzip jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
+$ tar xvf jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar
+```
+
+I now have ~/jdk1.8.0 as my java 8 install dir.

##You will need g++

We use g++ to build the JNI side of Aparapi

-    $ sudo apt-get install g++
+```bash
+
+$ sudo apt-get install g++
+```
+
##Pulling the HSA enabled Aparapi branch and building

-Now we can pull the Aparapi lambda/HSA branch from SVN
+Now we can pull the Aparapi lambda/HSA branch from GIT

-    $ sudo apt-get install subversion
-    $ svn checkout https://aparapi.googlecode.com/svn/branches/lambda aparapi-lambda
+    $ sudo apt-get install git
+    $ git clone https://github.com/Syncleus/aparapi-lambda.git
+
If you are familiar with Aparapi structure then this tree should not be that much of a surprise but there are a few subtle changes. Specifically the build system has been changed to support OKRA, Aparapi JNI code is provided as a Java agent and the execution scripts all refer to ${APARAPI_HOME}/env.sh to setup a reasonable execution environment.

@@ -201,12 +243,19 @@ Here are how I set my vars.

It is recommended (thanks notzed ;) ) that you test your env.sh using sh env.sh until it stops reporting errors. Once you have finished I recommend sourcing it into your current shell before building.

-    $ cd ~aparapi-lambda
-    $ . env.sh
-    $ ant
+```bash
+
+$ cd ~/aparapi-lambda
+$ . env.sh
+$ mvn
+```
+
If you get any problems check the env.sh vars first. If all is well you should be able to run some samples.

-    $ cd ~/aparapi-lambda/samples/mandel
-    $ sh hsailmandel.sh
+```bash
+
+$ cd ~/aparapi-lambda/samples/mandel
+$ sh hsailmandel.sh
+```
\ No newline at end of file
diff --git a/source/documentation/UnitTestGuide.html.md b/source/documentation/unit-tests.html.md
similarity index 60%
rename from source/documentation/UnitTestGuide.html.md
rename to source/documentation/unit-tests.html.md
index 799c89313989c438e0f400daf2f85248a76a5cb8..009047ead8350f9e00d95801cdf9f006550a4277 100644
--- a/source/documentation/UnitTestGuide.html.md
+++ b/source/documentation/unit-tests.html.md
@@ -1,9 +1,8 @@
---
- title: UnitTestGuide
+title: Unit Tests
+description: Unit test Guide Find out how to run Junit tests and how to add new tests.
---

-*Unit test Guide Find out how to run Junit tests and how to add new tests. Updated Sep 14, 2011 by frost.g...@gmail.com*
-
#Unit Test Guide

The Unit Test Guide explains the test infrastructure associated with Aparapi, including instructions for executing existing tests and adding new test cases.
@@ -11,17 +10,14 @@ OpenCL™ code generation tests

The initial open source tree includes the codegen subdirectory (test/codegen), which is used to validate the Aparapi bytecode to OpenCL™ conversion.

-    aparapi/trunk/
-       com.aparapi/
-          src/java/com.aparapi/
-          build.xml
-       test/
-          codegen/
-             src/java/
-                com.aparapi/
-                com.aparapi.test/
-             build.xml
-       build.xml
+```
+
+/src/test/java/
+    com/aparapi/
+        codegen/
+            test/
+pom.xml
+```

The code generation tests do not require OpenCL™, AMD APP SDK or GPU devices to be configured; these tests only validate the creation of valid OpenCL™ code by comparing against predefined expected output.

Before executing the code generation tests, build the com.aparapi sub-project and edit the junit.jar property in test/codegen/build.xml to point to your install directory.

-    <property name="junit.jar" value="C:\JUnit4.9\junit-4.9.jar"/>
+```xml
+
+<property name="junit.jar" value="C:\JUnit4.9\junit-4.9.jar"/>
+```

Initiate the code generation tests using ant.

-    C:\> cd tests/codegen
-    C:\> ant
-    <failures will be reported here>
-    C:>
+```
+
+C:\> cd tests/codegen
+C:\> ant
+<failures will be reported here>
+C:>
+```

View the HTML version of the JUnit report at junit/html/index.html.

On Microsoft Windows(r) platforms use

-    C:\> start junit\html\index.html
+```
+
+C:\> start junit\html\index.html
+```

On Linux(r) platforms just invoke your browser (Firefox in this case).

-    C:\> firefox junit\html\index.html
+```
+
+firefox junit/html/index.html
+```

##Adding a new OpenCL™ code generation test

The package `com.aparapi.test (codegen/src/java/com/aparapi/test)` contains all the tests.

Here is an example that tests the code generation resulting from a call to Kernel.getPassId(); this is taken from com.aparapi.test.CallGetPassId

-    package com.aparapi.test;
-
-    import com.aparapi.Kernel;
-
-    public class CallGetPassId extends Kernel{
-       public void run() {
-          int thePassId = getPassId();
-       }
-
-    }
-    /**{OpenCL{
-
-    typedef struct This_s{
-       int passid;
-    }This;
-    int get_pass_id(This *this){
-       return this->passid;
-    }
-    __kernel void run(
-       int passid
-    ){
-       This thisStruct;
-       This* this=&thisStruct;
-       this->passid = passid;
-       {
-          int thePassId = get_pass_id(this);
-          return;
-       }
-    }
-
-    }OpenCL}**/
+```java
+
+package com.aparapi.test;
+
+import com.aparapi.Kernel;
+
+public class CallGetPassId extends Kernel{
+    public void run() {
+        int thePassId = getPassId();
+    }
+
+}
+/**{OpenCL{
+
+typedef struct This_s{
+    int passid;
+}This;
+int get_pass_id(This *this){
+    return this->passid;
+}
+__kernel void run(
+    int passid
+){
+    This thisStruct;
+    This* this=&thisStruct;
+    this->passid = passid;
+    {
+        int thePassId = get_pass_id(this);
+        return;
+    }
+}
+
+}OpenCL}**/
+```

The test source takes the form of a simple class that extends the kernel and a block of OpenCL code between the /**{OpenCL{ and }OpenCL}**/ markers. The code between these markers is the OpenCL code that we expect Aparapi to produce as a result of converting the run() method to OpenCL.

@@ -102,76 +113,88 @@ To create a new test case, just add your test case to the `codegen/src/java/com/

Sometimes different javac implementations (such as Oracle and Eclipse) will generate different bytecode for the same source. When Aparapi converts this bytecode it may yield different (but equally acceptable) OpenCL forms.
One example of this is the BooleanToggle test:

-    public class BooleanToggle{
-       public void run() {
-          boolean pass = false;
+```java
+
+public class BooleanToggle{
+    public void run() {
+        boolean pass = false;

-          pass = !pass;
+        pass = !pass;

-       }
-    }
+    }
+}
+```

The BooleanToggle test code creates two (slightly different) versions of OpenCL™ (sadly one line different) depending on the javac compiler.

This example shows the ‘toggle’ OpenCL™ created from the bytecode generated by Oracle.

-    pass = pass==1?0:1;
+```java
+
+pass = pass==1?0:1;
+```

This example shows the bytecode from Eclipse javac:

-    pass = pass==0?1:0;
+```java
+
+pass = pass==0?1:0;
+```

Logically either of the above is correct. However, to accommodate the alternate acceptable forms we need to add two complete `/**{OpenCL{ and }OpenCL}**/` sections to the file. If either matches, the test will pass.

Here is the complete BooleanToggle code.

-    package com.aparapi.test;
-
-    public class BooleanToggle{
-       public void run() {
-          boolean pass = false;
-
-          pass = !pass;
-
-       }
-    }
-    /**{OpenCL{
-    typedef struct This_s{
-       int passid;
-    }This;
-    int get_pass_id(This *this){
-       return this->passid;
-    }
-    __kernel void run(
-       int passid
-    ){
-       This thisStruct;
-       This* this=&thisStruct;
-       this->passid = passid;
-       {
-          char pass = 0;
-          pass = (pass==0)?1:0;
-          return;
-       }
-    }
-    }OpenCL}**/
-    /**{OpenCL{
-    typedef struct This_s{
-       int passid;
-    }This;
-    int get_pass_id(This *this){
-       return this->passid;
-    }
-    __kernel void run(
-       int passid
-    ){
-       This thisStruct;
-       This* this=&thisStruct;
-       this->passid = passid;
-       {
-          char pass = 0;
-          pass = (pass!=0)?0:1;
-          return;
-       }
-    }
-    }OpenCL}**/
\ No newline at end of file
+```java
+
+package com.aparapi.test;
+
+public class BooleanToggle{
+    public void run() {
+        boolean pass = false;
+
+        pass = !pass;
+
+    }
+}
+/**{OpenCL{
+typedef struct This_s{
+    int passid;
+}This;
+int get_pass_id(This *this){
+    return this->passid;
+}
+__kernel void run(
+    int passid
+){
+    This thisStruct;
+    This* this=&thisStruct;
+    this->passid = passid;
+    {
+        char pass = 0;
+        pass = (pass==0)?1:0;
+        return;
+    }
+}
+}OpenCL}**/
+/**{OpenCL{
+typedef struct This_s{
+    int passid;
+}This;
+int get_pass_id(This *this){
+    return this->passid;
+}
+__kernel void run(
+    int passid
+){
+    This thisStruct;
+    This* this=&thisStruct;
+    this->passid = passid;
+    {
+        char pass = 0;
+        pass = (pass!=0)?0:1;
+        return;
+    }
+}
+}OpenCL}**/
+```
\ No newline at end of file
diff --git a/source/documentation/UsingAparapiLambdaBranchWithHSASimulator.html.md b/source/documentation/using-hsa-simulator.html.md
similarity index 53%
rename from source/documentation/UsingAparapiLambdaBranchWithHSASimulator.html.md
rename to source/documentation/using-hsa-simulator.html.md
index 3c268300c0273bb58ab3e63cf540fef84352f5e8..fd56034093f57dc746beb74d97f41fc20e0b3b39 100644
--- a/source/documentation/UsingAparapiLambdaBranchWithHSASimulator.html.md
+++ b/source/documentation/using-hsa-simulator.html.md
@@ -1,9 +1,8 @@
---
- title: UsingAparapiLambdaBranchWithHSASimulator
+title: Using HSA Simulator
+description: Using Aparapi lambda branch with HSA Simulator.
---

-*One-sentence summary of this page. Updated Feb 28, 2014 by frost.g...@gmail.com*
-
##Introduction

Although HSA compatible devices are available, we understand that Aparapi developers may not have access to these devices.
@@ -18,13 +17,19 @@ We assume you have ant, svn and g++ available because you can build other aparap
 
 You will also need git, libelf-dev, libdwarf-dev, flex and cmake
 
-    $ sudo apt-get install git libelf-dev libdwarf-dev flex cmake
+```bash
+
+$ sudo apt-get install git libelf-dev libdwarf-dev flex cmake
+```
 
 login...
 
-    $ git clone https://github.com/HSAFoundation/Okra-Interface-to-HSAIL-Simulator.git okra
-    $ cd okra
-    $ ant -f build-okra-sim.xml
+```bash
+
+$ git clone https://github.com/HSAFoundation/Okra-Interface-to-HSAIL-Simulator.git okra
+$ cd okra
+$ ant -f build-okra-sim.xml
+```
 
 ##The build should take approximately 15 mins.
 
@@ -35,15 +40,17 @@ Assuming your Java8 JDK is in /home/gfrost/jdk1.8.0
 
 Assuming your aparapi svn trunk is /home/gfrost/aparapi
 
-    $ export JAVA_HOME=/home/gfrost/jdk1.8.0
-    $ export OKRA=/home/gfrost/okra
-    $ export PATH=${PATH}:${JAVA_HOME}/bin:${OKRA}/dist/bin
-    $ java -version
-    java version "1.8.0-ea"
-    Java(TM) SE Runtime Environment (build 1.8.0-ea-b94)
-    Java HotSpot(TM) 64-Bit Server VM (build 25.0-b36, mixed mode)
-    $ cd /home/gfrost/aparapi/branches/lambda
-    $ ant
-    $ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OKRA}/dist/bin
-    $ java -agentpath:com.aparapi.jni/dist/libaparapi_x86_64.so -cp com.aparapi/dist/aparapi.jar:${OKRA}/dist/okra.jar hsailtest.Squares
-    $
\ No newline at end of file
+```bash
+
+$ export JAVA_HOME=/home/gfrost/jdk1.8.0
+$ export OKRA=/home/gfrost/okra
+$ export PATH=${PATH}:${JAVA_HOME}/bin:${OKRA}/dist/bin
+$ java -version
+java version "1.8.0-ea"
+Java(TM) SE Runtime Environment (build 1.8.0-ea-b94)
+Java HotSpot(TM) 64-Bit Server VM (build 25.0-b36, mixed mode)
+$ cd /home/gfrost/aparapi/branches/lambda
+$ ant
+$ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OKRA}/dist/bin
+$ java -agentpath:com.aparapi.jni/dist/libaparapi_x86_64.so -cp com.aparapi/dist/aparapi.jar:${OKRA}/dist/okra.jar hsailtest.Squares
+```
\ No newline at end of file
diff --git a/source/documentation/FrequentlyAskedQuestions.html.md b/source/introduction/faq.html.md
similarity index 97%
rename from source/documentation/FrequentlyAskedQuestions.html.md
rename to source/introduction/faq.html.md
index 60deaed9d9b2f00c9a734508f55dbd273dc7e36d..d1044c5da82ca79a90624432849b753659056f1a 100644
--- a/source/documentation/FrequentlyAskedQuestions.html.md
+++ b/source/introduction/faq.html.md
@@ -1,9 +1,8 @@
 ---
- title: FrequentlyAskedQuestions
+title: FAQ
+description: Frequently Asked Questions.
 ---
 
-*Frequently Asked Questions Updated Oct 17, 2012 by frost.g...@gmail.com*
-
 ##Frequently Asked Questions
 
 ##Why is this project called Aparapi and how is it pronounced?
 
@@ -88,12 +87,15 @@ Yes, we do ship a small JNI shim to handle the host OpenCL calls.
 
 From within the Java code itself you can query the execution mode after Kernel.execute(n) has returned.
 
-    Kernel kernel = new Kernel(){
-       @Override public void run(){
-       }
-    } ;
-    kernel.execute(1024);
-    System.out.priintln(“Execution mode = “+kernel.getExecutionMode());
+```java
+
+Kernel kernel = new Kernel(){
+   @Override public void run(){
+   }
+};
+kernel.execute(1024);
+System.out.println("Execution mode = "+kernel.getExecutionMode());
+```
 
 The above code fragment will print either ‘GPU’ if the kernel executed on the GPU or ‘JTP’ if Aparapi executed the Kernel in a Java Thread Pool.
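+
+If you want to force a particular mode (for example to compare GPU results against the Java fallback) you can also request one before calling Kernel.execute(). A minimal sketch, assuming the classic Kernel.EXECUTION_MODE enum and Kernel.setExecutionMode() API of this Aparapi generation:
+
+```java
+
+Kernel kernel = new Kernel(){
+   @Override public void run(){
+   }
+};
+// Request the Java Thread Pool fallback explicitly; Aparapi may still
+// override the request if that mode is unavailable on this machine.
+kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
+kernel.execute(1024);
+System.out.println("Execution mode = "+kernel.getExecutionMode());
+```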
diff --git a/source/documentation/AddressSpacesUsingBuffers.html.md b/source/proposals/address-space-with-buffers.html.md
similarity index 56%
rename from source/documentation/AddressSpacesUsingBuffers.html.md
rename to source/proposals/address-space-with-buffers.html.md
index 0b4a8bce63fabdc3673d55a1b7d01127fc6fac20..b51530776ddb21ac8aa7a76179633c4317902a8b 100644
--- a/source/documentation/AddressSpacesUsingBuffers.html.md
+++ b/source/proposals/address-space-with-buffers.html.md
@@ -1,8 +1,8 @@
 ---
- title: AddressSpacesUsingBuffers
+title: Address Space with Buffers
+description: Discussion of OpenCL address space support using java Buffers instead of arrays.
 ---
 
-*Proposal For OpenCL address space support using java Buffers instead of arrays. Updated Dec 8, 2011 by frost.g...@gmail.com*
 
 The general idea is to have an AS_PRIMTYPE_Buffer for each AS=address space and PRIM=primitive type. Here is an example for LocalFloatBuffer, which would be a buffer for floats that is mapped to the OpenCL local address space. As with normal FloatBuffers, the float elements are accessed using get and put methods
 
@@ -11,7 +11,11 @@ Although a LocalFloatBuffer conceptually exists only for the lifetime of a workg
 
 A typical declaration would be:
 
-    LocalFloatBuffer locbuf = new LocalFloatBuffer{12);
+```java
+
+LocalFloatBuffer locbuf = new LocalFloatBuffer(12);
+```
+
 The argument 12 here means that 12 floats would be used by each workitem in the workgroup. So the total buffer would be localSize*12 floats. Aparapi would at runtime allocate a total local OpenCL buffer to be this size. Note how this removes the need for the programmer to specify localSize anywhere.
 
 Note: For each Kernel.Entry.execute(globalSize) call, the runtime will determine an appropriate workgroup size, also called localSize, depending on the capabilities of the device, and on the globalSize. The localSize will always evenly divide the globalSize, in other words all workgroups for an execute context will be the same size. A workitem can determine localSize by calling getLocalSize().
 
@@ -20,28 +24,32 @@ Because workitems operate simultaneously and in an undetermined order, workitems
 
 Some example code (from NBody) follows. Here each workitem copies a "BODY" consisting of 4 floats. The global array contains 4*globalSize floats, and we want to iterate through this global array, copying it into local memory and operating on it there. This will take globalSize/localSize "tiles". For each tile, each workitem fills in one "BODY"'s worth, or 4 elements
 
-    // outside run method...
-    final int BODYSIZE = 4;
-    LocalFloatBuffer pos_xyzm_local = new LocalFloatBuffer(BODYSIZE);
-    //
-    // inside run method...
-    int numTiles = globalSize / localSize;
-    for (int i = 0; i < numTiles; ++i) {
-       // load one tile into local memory
-       int idx = i * localSize + localId; // index into a global memory array
-       localBarrier();
-       pos_xyzm_local.put(localId * BODYSIZE + 0, pos_xyzm[idx * BODYSIZE + 0]);
-       pos_xyzm_local.put(localId * BODYSIZE + 1, pos_xyzm[idx * BODYSIZE + 1]);
-       pos_xyzm_local.put(localId * BODYSIZE + 2, pos_xyzm[idx * BODYSIZE + 2]);
-       pos_xyzm_local.put(localId * BODYSIZE + 3, pos_xyzm[idx * BODYSIZE + 3]);
-       // Synchronize to make sure data is available for processing
-       localBarrier();
-
-       // now the entire LocalFloatBuffer has been filled.
-       // each workitem might use the entire Buffer
-       // which consists of localSize BODYs
-       for (int j = 0; j < localSize; ++j) {
-          float r_x = pos_xyzm_local.get(j * BODYSIZE + 0) - myPos_x;
-          float r_y = pos_xyzm_local.get(j * BODYSIZE + 1) - myPos_y;
-          float r_z = pos_xyzm_local.get(j * BODYSIZE + 2) - myPos_z;
-          // ...etc
\ No newline at end of file
+
+```java
+
+// outside run method...
+final int BODYSIZE = 4;
+LocalFloatBuffer pos_xyzm_local = new LocalFloatBuffer(BODYSIZE);
+//
+// inside run method...
+int numTiles = globalSize / localSize;
+for (int i = 0; i < numTiles; ++i) {
+   // load one tile into local memory
+   int idx = i * localSize + localId; // index into a global memory array
+   localBarrier();
+   pos_xyzm_local.put(localId * BODYSIZE + 0, pos_xyzm[idx * BODYSIZE + 0]);
+   pos_xyzm_local.put(localId * BODYSIZE + 1, pos_xyzm[idx * BODYSIZE + 1]);
+   pos_xyzm_local.put(localId * BODYSIZE + 2, pos_xyzm[idx * BODYSIZE + 2]);
+   pos_xyzm_local.put(localId * BODYSIZE + 3, pos_xyzm[idx * BODYSIZE + 3]);
+   // Synchronize to make sure data is available for processing
+   localBarrier();
+
+   // now the entire LocalFloatBuffer has been filled.
+   // each workitem might use the entire Buffer
+   // which consists of localSize BODYs
+   for (int j = 0; j < localSize; ++j) {
+      float r_x = pos_xyzm_local.get(j * BODYSIZE + 0) - myPos_x;
+      float r_y = pos_xyzm_local.get(j * BODYSIZE + 1) - myPos_y;
+      float r_z = pos_xyzm_local.get(j * BODYSIZE + 2) - myPos_z;
+      // ...etc
+```
\ No newline at end of file
diff --git a/source/documentation/DeviceProposal.html.md b/source/proposals/device.html.md
similarity index 51%
rename from source/documentation/DeviceProposal.html.md
rename to source/proposals/device.html.md
index a9426efeba933e5384f057cf519111631446874c..fdeb3abd93ae744775f1b8f912e094b93709ab1a 100644
--- a/source/documentation/DeviceProposal.html.md
+++ b/source/proposals/device.html.md
@@ -1,9 +1,8 @@
 ---
- title: DeviceProposal
+title: Device
+description: How we might use the extension mechanism devices for general Kernel execution.
 ---
 
-*How we might use the extension mechanism devices for general Kernel execution. Updated May 9, 2012 by frost.g...@gmail.com*
-
 At present the first GPU or CPU device (depending on Kernel.ExecutionMode value) is chosen at execution time. This makes it easy to execute simple Kernels, but is problematic when using some advanced features (barriers, local memory) or for sizing buffers appropriately for the target device. I propose that we add APIs to allow the developer to specify exactly which device we intend to target. In the extension proposal branch we needed to expose a Device class for binding arbitrary OpenCL to a Java interface. I suggest we also use this to query device information useful for allocating suitably sized global/local buffers, and for dispatching Kernels to specific devices.
 
@@ -12,10 +11,14 @@ The general pattern would be that we ask Aparapi to give us a Device, probably v
 
 Something like:
 
-    Device device = Device.best();
-We would also offer other useful factory methods `getBestGPU(), getFirstCPU() getJavaMultiThread(), getJavaSequential()` as well as a method to get all device so that the developer can filter themselves.
+```java
+
+Device device = Device.best();
+```
+
+We would also offer other useful factory methods `getBestGPU()`, `getFirstCPU()`, `getJavaMultiThread()`, `getJavaSequential()`, as well as a method to get all devices so that the developer can filter them.
-Note that as well as real OpenCL devices we also expose 'pseudo' devices such as JavaMultiThread and Sequential. We might also allow pseudo devices to group multiple devices. So getAllGPUDevices() might return a pseudo device for executing across devices.
+Note that as well as real OpenCL devices we also expose 'pseudo' devices such as JavaMultiThread and Sequential. We might also allow pseudo devices to group multiple devices. So `getAllGPUDevices()` might return a pseudo device for executing across devices.
 
     Device chosen=null;
    for (Device device: devices.getAll()){
       ...
       }
    }
 
-A Device can be queried `(isGPU(), isOpenCL(), isGroup(), isJava(), getOpenCLPlatform(), getMaxMemory(), getLocalSizes())` and may need to be cast to specific types.
+A Device can be queried (`isGPU()`, `isOpenCL()`, `isGroup()`, `isJava()`, `getOpenCLPlatform()`, `getMaxMemory()`, `getLocalSizes()`) and may need to be cast to specific types.
 
 This would allow us to configure buffers.
 
-    Device device = Device.best();
-    if (device instanceof OpenCLDevice){
-       OpenCLDevice openCLDevice = (OpenCLDevice)device;
-       char input[] = new char[openCLDevice.getMaxMemory()/4);
-    }
+```java
+
+Device device = Device.best();
+if (device instanceof OpenCLDevice){
+   OpenCLDevice openCLDevice = (OpenCLDevice)device;
+   char input[] = new char[openCLDevice.getMaxMemory()/4];
+}
+```
+
 We can also use the Device as a factory for creating Ranges.
 
-    Range range = device.createRange2D(width, height);
+```java
+
+Range range = device.createRange2D(width, height);
+```
+
 This allows the Range to be created with knowledge of the underlying device. So for example `device.createRange3D(1024, 1024, 1024, 16, 16, 16)` will fail if the device does not allow a local size of (16x16x16). A range created using `device.createRangeXX()` would also capture the device that created it. As if we had
 
-    Range range = device.createRange2D(width, height);
-    // implied range.setDevice(device);
-    This basically means that the Range locks the device that it can be used with.
-
-    So when we have a Kernel.
-
-    Kernel kernel = new Kernel(){
-       @Override public void run(){
-          ...
-       }
-    }
+```java
+
+Range range = device.createRange2D(width, height);
+// implied range.setDevice(device);
+```
+
+This basically means that the Range locks the device that it can be used with.
+
+So when we have a Kernel.
+
+```java
+
+Kernel kernel = new Kernel(){
+   @Override public void run(){
+      ...
+   }
+}
+```
+
 And we then use
 
-    Device device = Device.firstGPU();
-    final char input[] = new char[((OpenCLDevice)device).getMaxMemory()/4);
-    Kernel kernel = new Kernel(){
-       @Override public void run(){
-          // uses input[];
-       }
-    };
-    range = device.createRange2D(1024, 1024);
-    kernel.execute(range);
+```java
+
+Device device = Device.firstGPU();
+final char input[] = new char[((OpenCLDevice)device).getMaxMemory()/4];
+Kernel kernel = new Kernel(){
+   @Override public void run(){
+      // uses input[];
+   }
+};
+range = device.createRange2D(1024, 1024);
+kernel.execute(range);
+```
+
 We have forced execution on the first GPU. Java fallback would still be possible (should we forbid this?).
-    kernel.execute( Device.firstGPU().getRange2D(width, height));
+```java
+
+kernel.execute(Device.firstGPU().createRange2D(width, height));
+```
diff --git a/source/proposals/extensions.html.md b/source/proposals/extensions.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..d938bc9f7a75a6f050c2b1b62510745f40f07e3b
--- /dev/null
+++ b/source/proposals/extensions.html.md
@@ -0,0 +1,293 @@
+---
+title: Extensions
+description: A proposed aparapi extension mechanism.
+---
+
+##Here is a proposed Aparapi extension mechanism
+
+This would allow a developer to create a library that could be used by Aparapi Kernel code. The library would include OpenCL and Java implementations.
+
+We will treat this as a live document. Please join the discussions at http://groups.google.com/group/aparapi-discuss/browse_thread/thread/7ec81ecb2169aa4 and I will update this page to reflect what I think the latest decisions are:
+
+Currently Aparapi allows Java bytecode to be converted to OpenCL at runtime. Only the OpenCL generated by this conversion process is made available. Sometimes for performance reasons we might want to allow hand-coded OpenCL to be called from Aparapi kernel code.
+
+Here we will present a strawman API which would allow extension points to be added by an end user or by a library provider.
+
+We will use an FFT use case to walk through the steps.
+
+The FFT (Fast Fourier Transform) algorithm can be coded in Aparapi, but handcrafted OpenCL is likely to perform better. The goal is to allow Aparapi to do what it does best, i.e. manage the host buffer allocations, and provide a mechanism for binding arbitrary OpenCL code at runtime.
+
+So let's assume we wanted an Aparapi Kernel to be able to call an Aparapi extension for computing FFT (forward and reverse). The Kernel implementation might look like this.
+
+```java
+
+public static class BandStopFilter extends Kernel{
+   FFT fft = new FFT(); // Create an instance of the Extension point.
+   float[] real;
+   float[] imaginary;
+
+   BandStopFilter (float[] _real){
+      real = _real;
+      imaginary = new float[_real.length];
+
+   }
+
+   @Override public void run() {
+      fft.forward(real, imaginary);
+   }
+}
+```
+
+The main method then would just execute the Kernel using the familiar kernel.execute() method:
+
+```java
+
+public static void main(String[] args) {
+   float[] data = new float[1024];
+   BandStopFilter kernel = new BandStopFilter (data);
+   kernel.execute(data.length);
+}
+```
+
+Essentially we want the `FFT.forward(float[] _real, float[] _imaginary)` and `FFT.reverse(float[] _real, float[] _imaginary)` methods to be callable from Aparapi Kernel code. We want Aparapi to handle the call-forwarding and the argument/buffer mapping transfers. We want Aparapi to call the Java methods normally if OpenCL is not available, but would like Aparapi to use the implementor-provided OpenCL if it is. So the implementor will be required to provide both a Java and an OpenCL version of the callable methods, because Aparapi will decide which version needs to be called at runtime.
+
+Any extension point is required to implement the AparapiExtensionPoint interface.
+
+```java
+
+public interface AparapiExtensionPoint{
+   public String getOpenCL();
+}
+```
+
+Here is a possible (although incomplete) FFT implementation.
+
+```java
+
+public class FFT implements AparapiExtensionPoint{
+   @AparapiCallable public void forward(
+      @Global @ReadWrite float[] _data,
+      @Global @ReadWrite float[] _imaginary) {
+      // java implementation
+   }
+
+   @AparapiCallable public void reverse(
+      @Global @ReadWrite float[] _data,
+      @Global @ReadWrite float[] _imaginary) {
+      // java implementation
+   }
+
+   @Override public String getOpenCL() {
+      return ""
+      +"void my_package_FFT_forward("
+      +"   __global float* _real,"
+      +"   __global float* _imaginary )"
+      +"   {"
+      +"      // OpenCL implementation"
+      +"   }"
+      +"void my_package_FFT_reverse("
+      +"   __global float* _real,"
+      +"   __global float* _imaginary )"
+      +"   {"
+      +"      // OpenCL implementation"
+      +"   }";
+   }
+}
+```
+
+The implementer’s class will be required to define the callable Aparapi methods as well as implement the `getOpenCL()` method so that the OpenCL implementation of those methods can be extracted at runtime.
+
+Aparapi will provide annotations to decorate the methods and args/parameters of the exposed callable methods. These annotations provide information so that Aparapi can locate the callable methods, as well as parameter hints to help coordinate buffer types (global, local, constant) and transfer directions (read, write, readWrite) when executing the methods from a Kernel. This information is consulted during the normal bytecode analysis that Aparapi performs when it hits the call site.
+
+Note that the Java code inside the `@AparapiCallable` functions (or code executed from it) is not constrained to the normal Aparapi subset. It can be any legitimate Java code, but should be thread safe (because it will be called from JTP mode!).
+
+Note also that the OpenCL code yielded from the `getOpenCL()` method is assumed to be complete; Aparapi does not attempt to parse this code. If the code fails to compile Aparapi will fall back and execute the whole Kernel in JTP mode.
+
+BTW we show getOpenCL() returning a String literal. This is most likely to be how code is returned. However, it could be extracted from a file, a resource in the jar file, or dynamically generated based on some state. For example an FFT implementation might choose to use different code for radix2 or radix4 implementations (based on a parameter passed to the `FFT()` constructor - say `FFT(FFT.RADIX2)`) in which case the getOpenCL() method might yield different code.
+
+The above proposal covers the case where a third party might want to provide an Aparapi extension point as a library.
+
+We might also consider allowing single methods within the Kernel to be optimized, where the OpenCL is made available via the AparapiCallable annotation. The method would still use the same annotations for the args (to allow buffer transfers to be optimized).
+
+```java
+
+Kernel k = new Kernel(){
+   @AparapiCallable("/* opencl code for sum() goes here */")
+   int sum(@Global @ReadWrite int[] data, int length){
+      int sum = 0;
+      for (int v:data){
+         sum+=v;
+      }
+      return sum;
+   }
+   @Override public void run(){
+      sum(data, data.length);
+   }
+}
+```
+
+Here are the proposed new interfaces/annotations
+
+```java
+
+public interface AparapiExtensionPoint {
+   public String getOpenCL();
+}
+@Retention(RetentionPolicy.RUNTIME) @Target(ElementType.METHOD)
+public @interface AparapiCallable {
+   String value() default "";
+}
+
+@Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
+public @interface Global {}
+
+@Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
+public @interface Local {}
+
+@Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
+public @interface Constant {}
+
+@Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
+public @interface ReadWrite {}
+
+@Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
+public @interface ReadOnly {}
+
+@Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER)
+public @interface WriteOnly {}
+```
+
+And here is the example code in one chunk
+
+```java
+
+public class FFT implements AparapiExtensionPoint{
+   @AparapiCallable public void forward(
+      @Global @ReadWrite float[] _data,
+      @Global @ReadWrite float[] _imaginary) {
+      // java implementation
+   }
+
+   @AparapiCallable public void reverse(
+      @Global @ReadWrite float[] _data,
+      @Global @ReadWrite float[] _imaginary) {
+      // java implementation
+   }
+
+   @Override public String getOpenCL() {
+      return ""
+      +"void my_package_FFT_forward("
+      +"   __global float* _real,"
+      +"   __global float* _imaginary )"
+      +"   {"
+      +"      // OpenCL implementation"
+      +"   }"
+      +"void my_package_FFT_reverse("
+      +"   __global float* _real,"
+      +"   __global float* _imaginary )"
+      +"   {"
+      +"      // OpenCL implementation"
+      +"   }";
+   }
+}
+
+public class BandStopFilter extends Kernel{
+   FFT fft = new FFT();
+   float[] real;
+   float[] imaginary;
+
+   BandStopFilter (float[] _real){
+      real = _real;
+      imaginary = new float[_real.length];
+
+   }
+
+   @Override public void run() {
+      fft.forward(real, imaginary);
+   }
+}
+
+public static void main(String[] args) {
+   float[] data = new float[1024];
+   BandStopFilter kernel = new BandStopFilter (data);
+   kernel.execute(data.length);
+}
+```
+
+After discussion I think we are converging on a less complex solution. This is based on Witold's feedback suggestion (see below) where we use OpenCL annotations rather than forcing the implementation of the interface and the `getOpenCL()` method as originally suggested.
+
+So we will create an `@OpenCL` annotation for classes/methods.
+
+The `@OpenCL` annotation on the methods will contain the OpenCL source replacement for a specific method. The arg list will be created by Aparapi.
+
+The @OpenCL annotation on a class allows us to optionally introduce common code (helper methods, #pragmas, constants) which will precede the method declarations in the OpenCL code.
+
+So an FFT example whereby forward() and reverse() methods both called a common foo() method might look like this.
+
+```java
+
+@OpenCL(common="/* common void foo(){} + maybe #pragmas + accessible global fields declared here */")
+public class FFT implements AparapiExtensionPoint {
+   @OpenCL(signature="//function signature - OPTIONAL", body="{ /* uses foo(); */ }")
+   public void forward(
+      @Global @ReadWrite float[] _data,
+      @Global @ReadWrite float[] _imaginary) {
+      // java implementation
+   }
+   @OpenCL(function="{ /* uses foo(); */ }")
+   public void reverse(
+      @Global @ReadWrite float[] _data,
+      @Global @ReadWrite float[] _imaginary) {
+      // java implementation
+   }
+}
+```
+
+To invoke from an Aparapi kernel, we should be able to do something like
+
+```java
+
+public class BandStopFilter extends Kernel{
+   FFT fft = new FFT();
+   float[] real;
+   float[] imaginary;
+
+   BandStopFilter (float[] _real){
+      real = _real;
+      imaginary = new float[_real.length];
+
+   }
+
+   @Override public void run() {
+      fft.forward(this, real, imaginary);
+   }
+
+   public static void main(String[] args) {
+      float[] data = new float[1024];
+      BandStopFilter kernel = new BandStopFilter (data);
+      kernel.execute(data.length);
+   }
+}
+```
+
+Ideally we would also like to invoke FFT directly (instead of via a Kernel). This is tricky because the forward() and reverse() methods will need to be invoked across a range, and of course the dispatch across the range needs to be initiated from Aparapi.
+
+The only way I can see how to do this is to force the creation of an interface so we can use Java's existing Proxy mechanism to create a wrapper.
+
+```java
+
+@OpenCL(wraps=FFT.class)
+interface FFTInterface{
+   public void forward( Range _range, float[] _data, float[] _imaginary);
+   public void reverse( Range _range, float[] _data, float[] _imaginary);
+}
+```
+
+Then provide a mechanism for extracting a proxy and invoking it.
+
+```java
+
+float[] real = //??
+float[] imag = //??
+Aparapi.wrap(FFTInterface.class).forward(range, real, imag);
+```
+
+I can't see a cleaner solution.
diff --git a/source/proposals/lambda-syntax.html.md b/source/proposals/lambda-syntax.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..513156b8dbab8b5575d61f74b99adf0f66f3d9e6
--- /dev/null
+++ b/source/proposals/lambda-syntax.html.md
@@ -0,0 +1,161 @@
+---
+title: Lambda Syntax
+description: Syntax suggestions for HSA enabled Aparapi.
+---
+
+#Introduction
+
+Now that Java 8 is nearly upon us and the HSA enabled Aparapi 'lambda' branch is usable (though in no way complete) I figured we could use this page to discuss the 'programming model' we might prefer for Aparapi, and contrast it with the new Java 8 lambda based stream APIs.
+
+##Converting between Aparapi HSA + Java 8 enabled Aparapi
+
+Our **hello world** app has always been the ''vector add''. In classic Aparapi we could transform
+
+```java
+
+final float inA[] = .... // get a float array from somewhere
+final float inB[] = ....
// get a float array from somewhere
+   // assume (inA.length==inB.length)
+final float result[] = new float[inA.length];
+
+for (int i=0; i<result.length; i++){
+   result[i]=inA[i]+inB[i];
+}
+```
+
+to
+
+```java
+
+Kernel kernel = new Kernel(){
+   @Override public void run(){
+      int i = getGlobalId();
+      result[i]=inA[i]+inB[i];
+   }
+};
+Range range = Range.create(result.length);
+kernel.execute(range);
+```
+
+For the lambda Aparapi branch we can currently use
+
+```java
+
+Device.hsa().forEach(result.length, i-> result[i]=inA[i]+inB[i]);
+```
+
+Note that the closest Java 8 construct is
+
+```java
+
+IntStream.range(0, result.length).parallel().forEach(i-> result[i]=inA[i]+inB[i]);
+```
+
+Aparapi and the Java 8 stream APIs both use IntConsumer as the lambda type. So you can reuse the lambda.
+
+```java
+
+IntConsumer lambda = i-> result[i]=inA[i]+inB[i];
+
+IntStream.range(0, result.length).parallel().forEach(lambda);
+Device.hsa().forEach(result.length, lambda);
+```
+
+Exposing the 'Deviceness' of this was a conscious decision. We may also hide it completely.
+
+```java
+
+IntConsumer lambda = i-> result[i]=inA[i]+inB[i];
+
+IntStream.range(0, result.length).parallel().forEach(lambda);
+Aparapi.forEach(result.length, lambda);
+```
+
+I am toying with providing an API which maps more closely to the Stream API from Java 8.
+
+Maybe
+
+```java
+
+IntStream.range(0, result.length).parallel().forEach(lambda);
+Aparapi.range(0, result.length).parallel().forEach(lambda);
+```
+
+This way users can more readily swap between the two.
+
+For collections/arrays in Aparapi we can also offer
+
+```java
+
+T[] arr = // get an array of T from somewhere
+ArrayList<T> list = // get an array backed list of T from somewhere
+
+Aparapi.range(arr).forEach(t -> /* do something with each T */);
+```
+
+We can create special cases. Say for mutating images
+
+```java
+
+BufferedImage in, out;
+Aparapi.forEachPixel(in, out, rgb -> rgb[0] = 0 );
+```
+
+We may also need select operations for associative operations
+
+```java
+
+class Person{
+   int age;
+   String first;
+   String last;
+};
+
+Aparapi.selectOne(Person[] people, (p1,p2)-> p1.age>p2.age?p1:p2 );
+```
+
+##A case for map reduce
+
+A mapper maps from one type to another, possibly by extracting state. Here is a mapper which maps each String in an array of Strings to its length.
+
+As if the mapper was
+
+```java
+
+interface mapToInt<T>{ int map(T v); }
+```
+
+Here it is in action.
+
+```java
+
+Aparapi.range(strings).map(s->s.length())...
+```
+
+Now the result is a stream of ints which can be 'reduced' by a reduction lambda.
+
+In this case the reduction reduces two ints to one, by choosing the max of k and v. All reductions must be commutative-style operations (max, min, add) where the order of execution is not important.
+
+```java
+
+int lengthOfLongestString = Aparapi.range(strings).map(s->s.length()).reduce((k,v)-> k>v?k:v);
+```
+
+Here is a sum reduction.
+
+```java
+
+int sumOfLengths = Aparapi.range(strings).map(s->s.length()).reduce((k,v)-> k+v);
+```
+
+Some of these may be common enough that we offer direct functionality.
+
+```java
+
+int sumOfLengths = Aparapi.range(strings).map(s->s.length()).sum();
+int maxOfLengths = Aparapi.range(strings).map(s->s.length()).max();
+int minOfLengths = Aparapi.range(strings).map(s->s.length()).min();
+String string = Aparapi.range(strings).map(s->s.length()).select((k,v)-> k>v);
+```
+
+This last one needs some explaining. We map each String to an int and then select the String whose length is greatest.
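+
+In plain Java the semantics of that select() would be something like the following sketch (strings is assumed to be a non-empty String[]; the select() API itself is only a proposal):
+
+```java
+
+// Keep the element whose mapped value (its length) wins the (k,v)-> k>v comparison.
+String selected = strings[0];
+for (String s : strings){
+   if (s.length() > selected.length()){
+      selected = s;
+   }
+}
+```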
\ No newline at end of file
diff --git a/source/documentation/AddingLambdasToAparapi.html.md b/source/proposals/lambdas.html.md
similarity index 74%
rename from source/documentation/AddingLambdasToAparapi.html.md
rename to source/proposals/lambdas.html.md
index fc4ea03b153cd015fcabfaa231dd4b6eb6a938ff..121e44fe9d7eab1b413728ae4ae0a4cd27a2cd45 100644
--- a/source/documentation/AddingLambdasToAparapi.html.md
+++ b/source/proposals/lambdas.html.md
@@ -1,49 +1,66 @@
 ---
- title: AddingLambdasToAparapi
+title: Lambdas
+description: Proposals for Java 8 Lambda Support to Aparapi.
 ---
 
-*Adding Java 8 Lambda Support to Aparapi Updated Jun 24, 2013 by frost.g...@gmail.com*
-
 In the recently added ''lambda'' branch we have been experimenting with adding lambda support to Aparapi. We believe that this upcoming Java 8 feature will be a natural way to express parallel algorithms which can be executed on the GPU. A link to the branch can be found here.
 
-You will need to get the latest binary build of ''Project Lambda'' to experiment with these new features. The 'Project Lambda' preview can be found here.
+You will need to get the latest binary build of ''Project Lambda'' to experiment with these new features.
 
 Once you have a Lambda enabled Java 8 JDK, set JAVA_HOME to point at it and build Aparapi. From the root of SumatraExperiments just use
 
-    $ ant
+```bash
+
+$ mvn
+```
+
 We are slowly walking through some of the Aparapi demos and converting them. At present NBody and Mandel have been converted. With Lambda enabled Aparapi we remove the need to derive from a base Kernel class; instead we allow the user to express their code as a lambda using the following basic pattern
 
-    Device.bestGPU().forEach(int range, IntConsumer lambda);
+```java
+
+Device.bestGPU().forEach(int range, IntConsumer lambda);
+```
+
 The Java 8 stream API defines a type called java.util.function.IntConsumer. This is essentially an interface with a Single Abstract Method (these types are referred to as SAM types in the stream API code). IntConsumer looks something like....
 
-    interface IntConsumer{
-       public void accept(int Id);
-    }
+```java
+
+interface IntConsumer{
+   public void accept(int Id);
+}
+```
+
 So you can run the familiar 'squares' kernel using
 
-    int in[] = ..//
-    int out[] = .../
-    Device.bestGPU().forEach(in.length, (i)->{
-       out[i] = in[i] * in[i];
-    });
+```java
+
+int in[] = ... //
+int out[] = ... //
+Device.bestGPU().forEach(in.length, (i)->{
+    out[i] = in[i] * in[i];
+  });
+```
 
 Instead of
 
-    int in[] = ..//
-    int out[] = .../
-    Device.bestGPU().forEach(in.length, new IntConsumer(){
-       public void accept(int i){
-          out[i] = in[i] * in[i];
-       }
-    });
+```java
+
+int in[] = ... //
+int out[] = ... //
+Device.bestGPU().forEach(in.length, new IntConsumer(){
+   public void accept(int i){
+      out[i] = in[i] * in[i];
+   }
+  });
+```
 
 To accommodate lambdas we created Device.forEach(int range, IntConsumer ic) which converts the bytecode of the ic parameter to OpenCL at runtime. The captured args (in, out and i - in this case) are passed to the GPU and the kernel executed.
 
@@ -57,11 +74,18 @@ This will mean that in future we will change how Aparapi is launched.
 Instead of
 
-    $ java -Djava.library.path=path/to/aparapi -classpath path/to/aparapi/aparapi.jar:your.jar YourClass
+```bash
+
+$ java -Djava.library.path=path/to/aparapi -classpath path/to/aparapi/aparapi.jar:your.jar YourClass
+```
 
 We will use
 
-    $ java -agentlib=path/to/aparapi/aparapi.dll -classpath path/to/aparapi/aparapi.jar:your.jar YourClass
+```bash
+
+$ java -agentpath:path/to/aparapi/aparapi.dll -classpath path/to/aparapi/aparapi.jar:your.jar YourClass
+```
+
 We are also looking into the possibility of having this agent provide the bytecode for all Aparapi classes. We believe that this will enable us to ultimately remove MethodModel/ClassModel and even the InstructionSet classes and handle all of this in JNI.
 
 We would welcome comments on these proposals. Either here, or in the discussion list. Let us know what you think.
 
@@ -79,31 +103,58 @@ We would welcome comments on these proposals. Either here, or in the discussion
 
 This version would allow us to carry over Aparapi's device selection
 
-    Device.bestGPU().forEach(1024, i->{lambda});
+```java
+
+Device.bestGPU().forEach(1024, i->{lambda});
+```
+
 This version would allow us to carry over Aparapi's Range selection
 
-    Device.bestGPU().range2D(width, height).forEach(1024, rid->{lambda});
+```java
+
+Device.bestGPU().range2D(width, height).forEach(1024, rid->{lambda});
+```
+
 This version would allow us to mimic Kernel.execute(1024, 5)
 
-    Device.bestGPU().forEach(1024, 5, (id, passid)->{lambda});
+```java
+
+Device.bestGPU().forEach(1024, 5, (id, passid)->{lambda});
+```
+
 We could even have the range iterated over until some other lambda determines we are done
 
-    Device.bestGPU().forEachUntil(1024, id->{lambda}, ->{predicate lambda});
+```java
+
+Device.bestGPU().forEachUntil(1024, id->{lambda}, ()->{predicate lambda});
+```
+
 Explicit buffer handling could be removed in many cases by allowing the bytecode of the 'until' predicate to be snooped for buffer references.
 
-    int lotsOfData[] = ...;
-    boolean found[false] = new boolean[1];
-    Device.bestGPU().forEachUntil(1024, 5,
-       (id, passid)->{ /* mutate lotsOfData, found[0]=true when done */ }
-       ->{found[0]]});
+```java
+
+int lotsOfData[] = ...;
+boolean found[] = new boolean[1];
+Device.bestGPU().forEachUntil(1024, 5,
+   (id, passid)->{ /* mutate lotsOfData, found[0]=true when done */ },
+   ()->found[0]);
+```
+
 In the above cases Aparapi can determine that between each pass it needs to ''ONLY'' copy found[] back from the device.
 
 There is no reason that the range itself needs to be constant; we can use a collection/iterable. This helps with some reductions.
 
-    int range[] = new int[]{1024,512,128,64,32,16,8,4,2,1,0};
-    Device.bestGPU().forEach(range,{lambda});
+```java
+
+int range[] = new int[]{1024,512,128,64,32,16,8,4,2,1,0};
+Device.bestGPU().forEach(range, id->{lambda});
+```
+
 or the range can be a lambda itself; here we specify a start and an end value for the range, and a lambda to provide each step.
 
-    Device.bestGPU().forEach(1024, 1, r->{return(r/2);},(pass, r, id)->{lambda});
-    // or
-    Device.bestGPU().forEach(1, 1024, r->{return(r*2);},(pass, r, id)->{lambda});
+```java
+
+Device.bestGPU().forEach(1024, 1, r->{return(r/2);},(pass, r, id)->{lambda});
+// or
+Device.bestGPU().forEach(1, 1024, r->{return(r*2);},(pass, r, id)->{lambda});
+```
diff --git a/source/proposals/multiple-dim-nd-range.html.md b/source/proposals/multiple-dim-nd-range.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7c35374989efdb55630de0dedd84898301c1ef8
--- /dev/null
+++ b/source/proposals/multiple-dim-nd-range.html.md
@@ -0,0 +1,240 @@
+---
+title: Multiple Dim ND Range
+description: A proposal for accessing multi-dim ND range execution.
+---
+
+We can discuss this proposal either here (in comments) or via the discussion list here.
+
+Note this has nothing to do with accessing Java 2D arrays in Aparapi. This discussion is focused on the ability to expose the execution of kernels over 1, 2 or 3 dimensions. The memory in each case is a single contiguous region (like a single dimension primitive array).
+
+At present an Aparapi kernel can only be executed using a single dimension. If we wish to represent execution over a WIDTH x HEIGHT element grid we would execute over the range (WIDTH*HEIGHT) and manually divide/mod getGlobalId() by WIDTH to determine the x and y for each.
+
+Similarly we would multiply y by WIDTH and add x (y*WIDTH+x) to convert an X,Y location to a linear global id
+
+```java
+
+final static int WIDTH=128;
+final static int HEIGHT=64;
+final int in[] = new int[WIDTH*HEIGHT];
+final int out[] = new int[WIDTH*HEIGHT];
+Kernel kernel = new Kernel(){
+   public void run(){
+      int x = getGlobalId()%WIDTH;
+      int y = getGlobalId()/WIDTH;
+      if (!(x==1 || x==(WIDTH-1) || y==1 || y==(HEIGHT-1))){
+         int sum = 0;
+         for (int dx =-1; dx<2; dx++){
+            for (int dy =-1; dy<2; dy++){
+               sum+=in[(y+dy)*WIDTH+(x+dx)];
+            }
+         }
+         out[y*WIDTH+x] = sum/9;
+         // or out[getGlobalId()] = sum/9;
+      }
+   }
+
+};
+kernel.execute(WIDTH*HEIGHT);
+```
+
+OpenCL natively allows the user to execute over 1, 2 or 3 dimension grids via the clEnqueueNDRangeKernel() method.
+
+We chose not to expose this in Aparapi but there have been requests for us to allow it.
+
+There are a number of things to consider here:
+
+1. Extending the syntax of kernel.execute() to allow multi dimensional grids.
+1. Mapping Kernel methods to OpenCL's get_local_id(int dim), get_local_size(int dim), get_group_id(int dim), etc. At present we map kernel.getGlobalId() to get_global_id(0).
+1. Handling all of these when an application drops back to JTP mode.
+
+##Extending Kernel.execute(int range)
+
+Sadly we can't overload Kernel.execute(int range), Kernel.execute(int xrange, int yrange) and Kernel.execute(int xrange, int yrange, int zrange) because we already have kernel.execute(int, int) mapped for executing multiple passes over the linear range.
+
+Remember
+
+```java
+
+for (int pass=0; pass<20; pass++){
+   kernel.execute(1024);
+}
+```
+
+Is equivalent to
+
+```java
+
+kernel.execute(1024, 20);
+```
+
+I think I would prefer
+
+```java
+
+Kernel.execute(int range)
+Kernel.execute(int range, int passes)
+Kernel.executeXY(int xrange, int yrange)
+Kernel.executeXY(int xrange, int yrange, int passes)
+Kernel.executeXYZ(int xrange, int yrange, int zrange)
+Kernel.executeXYZ(int xrange, int yrange, int zrange, int passes)
+```
+
+Obviously in the above calls we are only supplying the global bounds for the grid.
We could also provide mappings allowing local ranges. I think I would prefer
+
+```java
+
+Kernel.executeLocal(int range, int local)
+Kernel.executeLocal(int range, int local, int passes)
+Kernel.executeXYLocal(int xrange, int yrange, int xlocalrange, int ylocalrange)
+Kernel.executeXYLocal(int xrange, int yrange, int xlocalrange, int ylocalrange, int passes)
+Kernel.executeXYZLocal(int xrange, int yrange, int zrange, int xlocalrange, int ylocalrange, int zlocalrange)
+Kernel.executeXYZLocal(int xrange, int yrange, int zrange, int xlocalrange, int ylocalrange, int zlocalrange, int passes)
+```
+
+Another alternative may be to create Range classes
+
+```java
+
+class Range{
+   int passes;
+   int width;
+   static Range create(int width);
+   static Range create(int width, int passes);
+}
+
+class Range2D extends Range{
+   int height;
+   static Range create(int width, int height);
+   static Range create(int width, int height, int passes);
+
+}
+
+class Range3D extends Range2D{
+   int depth;
+   static Range create(int width, int height, int depth);
+   static Range create(int width, int height, int depth, int passes);
+}
+```
+
+With appropriate constructors (or factory methods) to allow
+
+```java
+
+Kernel.execute(Range range)
+```
+
+Execution would then simply be
+
+```java
+
+Kernel.execute(Range.create(1,1))
+```
+
+We can also arrange for the group size to be placed in the base Range class.
+
+```java
+
+class Range{
+   int groupSize;
+   int passes;
+   int width;
+   static Range create(int width);
+   static Range create(int width, int passes);
+}
+```
+
+##Mapping to OpenCL multi dim methods, i.e. get_global_id(1), get_local_size(2) etc
+
+We could just add getGlobalId(int dim), getLocalSize(int dim) etc to replicate OpenCL methods.
+
+I would prefer to offer the following global mappings
+
+|Kernel | OpenCL|
+|-----|------|
+|getGlobalId()| get_global_id(0)|
+|getGlobalX()| get_global_id(0)|
+|getGlobalY()| get_global_id(1)|
+|getGlobalZ()| get_global_id(2)|
+|getGlobalSize()| get_global_size(0)|
+|getGlobalWidth()| get_global_size(0)|
+|getGlobalHeight()| get_global_size(1)|
+|getGlobalDepth()| get_global_size(2)|
+
+And the following local mappings
+
+|Kernel| OpenCL|
+|-----|-------|
+|getLocalId()| get_local_id(0)|
+|getLocalX()| get_local_id(0)|
+|getLocalY()| get_local_id(1)|
+|getLocalZ()| get_local_id(2)|
+|getLocalSize()| get_local_size(0)|
+|getLocalWidth()| get_local_size(0)|
+|getLocalHeight()| get_local_size(1)|
+|getLocalDepth()| get_local_size(2)|
+
+##An example
+
+```java
+
+final static int WIDTH=128;
+final static int HEIGHT=64;
+final int in[] = new int[WIDTH*HEIGHT];
+final int out[] = new int[WIDTH*HEIGHT];
+Kernel kernel = new Kernel(){
+   public void run(){
+      int x = getGlobalX();
+      int y = getGlobalY();
+      if (!(x==1 || x==(getGlobalWidth()-1) || y==1 || y==(getGlobalHeight()-1))){
+         int sum = 0;
+         for (int dx =-1; dx<2; dx++){
+            for (int dy =-1; dy<2; dy++){
+               sum+=in[(y+dy)*getGlobalWidth()+(x+dx)];
+            }
+         }
+         out[y*getGlobalWidth()+x] = sum/9;
+         // or out[getGlobalId()] = sum/9;
+      }
+   }
+
+};
+kernel.executeXY(WIDTH, HEIGHT);
+```
+
+Or if we choose the Range class approach.
+
+```java
+
+final static int WIDTH=128;
+final static int HEIGHT=64;
+final int in[] = new int[WIDTH*HEIGHT];
+final int out[] = new int[WIDTH*HEIGHT];
+Kernel kernel = new Kernel(){
+   public void run(){
+      int x = getGlobalX();
+      int y = getGlobalY();
+      if (!(x==1 || x==(getGlobalWidth()-1) || y==1 || y==(getGlobalHeight()-1))){
+         int sum = 0;
+         for (int dx =-1; dx<2; dx++){
+            for (int dy =-1; dy<2; dy++){
+               sum+=in[(y+dy)*getGlobalWidth()+(x+dx)];
+            }
+         }
+         out[y*getGlobalWidth()+x] = sum/9;
+         // or out[getGlobalId()] = sum/9;
+      }
+   }
+
+};
+kernel.execute(Range2D.create(WIDTH, HEIGHT));
+```
+
+##Handling this from JTP mode
+
+Mapping to OpenCL for this is all fairly straightforward.
+
+In Java JTP mode we will have to emulate this. For `get_global_id(0..2)` (`getGlobalX()`, `getGlobalY()` and `getGlobalZ()` using our proposed Aparapi Java mappings) we can of course easily offer reasonable implementations; this just requires the Java code to essentially nest 3 loops (or emulate this) and set `globalX`, `globalY`, `globalZ` inside each nesting.
+
+For `get_local_size(0..2)` (`getLocalWidth()`, `getLocalHeight()` and `getLocalDepth()` using our proposed Aparapi Java mappings) we will need to break the globalWidth/globalHeight and globalDepth into some arbitrary equal 'chunks' (note I am avoiding using the word groups here to avoid confusion with `get_group_size(0..2)`!).
+
+At present we always create a synthetic group in JTP mode which is sized to the # of cores. This will need to be changed. If the user requests a grid (64,64,8,8) (global width 64, global height 64, local width 8, local height 8) then we will have to create a JTP group of 64 (8x8) and, just in case the kernel code contains a barrier, we will need to ensure we launch 64 threads for this group. From our experience it is best to launch one thread per core, so we may lose some JTP performance executing in this mode.
\ No newline at end of file
diff --git a/source/proposals/multiple-entry-points.html.md b/source/proposals/multiple-entry-points.html.md
new file mode 100644
index 0000000000000000000000000000000000000000..eb2ad45ae9660def3977c3136224f1ab7d303763
--- /dev/null
+++ b/source/proposals/multiple-entry-points.html.md
@@ -0,0 +1,437 @@
+---
+title: Multiple Entry Points
+description: How to extend Aparapi to allow multiple entrypoints for kernels.
+---
+
+##The Current Single Entrypoint World
+
+At present Aparapi allows us to dispatch execution to a single entry point in a Kernel. Essentially for each Kernel only the overridden Kernel.run() method can be used to initiate execution on the GPU.
+
+Our canonical example is the 'Squarer' Kernel, which writes the square of each element of an input array to an output array.
+
+```java
+
+Kernel squarer = new Kernel(){
+   @Override public void run(){
+      int id = getGlobalId(0);
+      out[id] = in[id] * in[id];
+   }
+};
+```
+
+If we wanted a vector addition Kernel we would have to create a whole new Kernel.
+
+```java
+
+Kernel adder = new Kernel(){
+   @Override public void run(){
+      int id = getGlobalId(0);
+      out[id] = in[id] + in[id];
+   }
+};
+```
+
+For us to square and then add a constant we would have to invoke two kernels, or of course create a single SquarerAdder kernel.
+
+See this page EmulatingMultipleEntrypointsUsingCurrentAPI for ideas on how to emulate having multiple methods, by passing data to a single run() method; a minimal sketch of that idea follows.
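+
+For illustration only, that emulation amounts to switching on a mode field inside the one run() method (the ModalKernel name, its fields and the mode constants below are hypothetical, not Aparapi API):
+
+```java
+
+import com.aparapi.Kernel;
+
+class ModalKernel extends Kernel{
+   static final int SQUARE = 0;
+   static final int ADD = 1;
+   int mode = SQUARE;  // set from the host before each execute() call
+   float in[], out[];
+
+   @Override public void run(){
+      int id = getGlobalId();
+      if (mode == SQUARE){
+         out[id] = in[id] * in[id];
+      }else{ // ADD
+         out[id] = in[id] + in[id];
+      }
+   }
+}
+```
+
+The host sets mode and re-executes the same kernel for each operation; the cost is that every branch travels with every dispatch, which is exactly what the proposal below tries to avoid.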
+
+##Why can't Aparapi just allow 'arbitrary' methods
+
+Ideally we would just expose a more natural API, one which allows us to provide specific methods for each arithmetic operation.
+
+Essentially
+
+```java
+
+class VectorKernel extends Kernel{
+   public void add();
+   public void sub();
+   public void sqr();
+   public void sqrt();
+}
+```
+
+Unfortunately this is hard to implement using Aparapi. There are two distinct problems, both at runtime.
+
+* How will Aparapi know which of the available methods we want to execute when we call Kernel.execute(range)?
+* On first execution how does Aparapi determine which methods might be entrypoints and therefore need to be converted to OpenCL?
+
+The first problem can be solved by extending Kernel.execute() to accept a method name
+
+```java
+
+kernel.execute(SIZE, "add");
+```
+
+This is the obvious solution, but it causes maintenance issues in that it trades compile-time reporting for runtime errors. If a developer mistypes the name of the method:
+
+```java
+
+kernel.execute(SIZE, "sadd"); // there is no such method
+```
+
+The code will compile perfectly; only at runtime will we detect that there is no such method.
+
+##An aside
+
+Maybe the new Java 8 method reference feature might help here. In the paper below Brian Goetz talks about a double-colon syntax (Class::Method) for directly referencing a method, which is presumably checked at compile time.
+
+So presumably
+
+```java
+
+kernel.execute(SIZE, VectorKernel::add);
+```
+
+Would compile just fine, whereas
+
+```java
+
+kernel.execute(SIZE, VectorKernel::sadd);
+```
+
+Would yield a compile-time error.
+
+See Brian Goetz's excellent Lambda documentation
+
+##Back from the aside
+
+The second problem (knowing which methods need to be converted to OpenCL) can probably be solved using an annotation.
+
+```java
+
+class VectorKernel extends Kernel{
+   @EntryPoint public void add();
+   @EntryPoint public void sub();
+   @EntryPoint public void sqr();
+   @EntryPoint public void sqrt();
+   public void nonOpenCLMethod();
+}
+```
+
+Here the @EntryPoint annotation allows the Aparapi runtime to determine which methods need to be exposed.
+
+#My Extension Proposal
+
+Here is my proposal. Not only does it allow us to reference multiple entrypoints, but I think it actually improves the single entrypoint API, albeit at the cost of being more verbose.
+
+##The developer must provide an API interface
+
+First I propose that we should ask the developer to provide an interface for all methods that we wish to execute on the GPU (or convert to OpenCL).
+
+```java
+
+interface VectorAPI extends AparapiAPI {
+   public void add(Range range);
+   public void sub(Range range);
+   public void sqrt(Range range);
+   public void sqr(Range range);
+}
+```
+
+Note that each API takes a Range; this will make more sense in a moment.
+
+##The developer provides a bound implementation
+
+Aparapi should provide a mechanism for mapping the proposed API to its implementation.
+
+Note the weasel words here: this is not a conventional implementation of an interface. We will use an annotation (@Implements(Class class)) to provide the binding.
+
+```java
+
+@Implements(VectorAPI.class) class Vector extends Kernel {
+   public void add(RangeId rangeId){/*implementation here */}
+   public void sub(RangeId rangeId){/*implementation here */}
+   public void sqrt(RangeId rangeId){/*implementation here */}
+   public void sqr(RangeId rangeId){/*implementation here */}
+   public void nonOpenCLMethod();
+}
+```
+
+##Why can't the implementation just implement the interface?
+
+This would be ideal. Sadly we need to intercept a call to, say, VectorAPI.add(Range) and dispatch to the resulting Vector.add(RangeId) instances. If you look at the signatures, the interface accepts a Range as its arg (the range over which we intend to execute) whereas the implementation (either called by JTP threads or GPU OpenCL dispatch) receives a RangeId (containing the unique globalId, localId, etc fields). At the very end of this page I show a strawman sequential loop implementation.
+
+##So how do we get an implementation of VectorAPI
+
+We instantiate our Kernel by creating an instance using new. We then ask this instance to create an API instance. Some (presumably java.lang.reflect.Proxy based) trickery will create an implementation of the interface, backed by the Java implementation.
+
+So execution would look something like.
+
+```java
+
+Vector kernel = new Vector();
+VectorAPI kernelApi = kernel.api();
+Range range = Range.create(SIZE);
+kernelApi.add(range);
+```
+
+So the Vector instance is a pure Java implementation. The extracted API is the bridge to the GPU.
+
+Of course then we can also execute using an inline call through api()
+
+```java
+
+Vector kernel = new Vector();
+Range range = Range.create(SIZE);
+kernel.api().add(range);
+kernel.api().sqrt(range);
+```
+
+or even expose api as a public final field
+
+```java
+
+Vector kernel = new Vector();
+Range range = Range.create(SIZE);
+kernel.api.add(range);
+kernel.api.sqrt(range);
+```
+
+##How would our canonical Squarer example look
+
+```java
+
+interface SquarerAPI extends AparapiAPI{
+   void square(Range range);
+}
+
+@Implements(SquarerAPI.class) class Squarer extends Kernel{
+   int in[];
+   int square[];
+   public void square(RangeId rangeId){
+      square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid];
+   }
+}
+```
+
+Then we execute using
+
+```java
+
+Squarer squarer = new Squarer();
+// fill squarer.in[SIZE]
+// create squarer.square[SIZE];
+
+
+squarer.api().square(Range.create(SIZE));
+```
+
+#Extending this proposal to allow argument passing
+
+Note that we have effectively replaced the use of the 'abstract' squarer.execute(range) with the more concrete squarer.api().square(range).
+
+Now I would like to propose that we take one more step by allowing us to pass arguments to our methods.
+
+Normally Aparapi captures buffer and field accesses to create the args that it passes to the generated OpenCL code. In our canonical squarer example the `in[]` and `square[]` buffers are captured from the bytecode and passed (behind the scenes) to the OpenCL.
+
+However, by exposing the actual method we want to execute, we could also allow the API to accept parameters.
+
+So our squarer example would go from
+
+```java
+
+interface SquarerAPI extends AparapiAPI{
+   void square(Range range);
+}
+
+@Implements(SquarerAPI.class) class Squarer extends Kernel{
+   int in[];
+   int square[];
+   public void square(RangeId rangeId){
+      square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid];
+   }
+}
+
+
+Squarer squarer = new Squarer();
+// fill squarer.in[SIZE]
+// create squarer.square[SIZE];
+
+squarer.api().square(Range.create(SIZE));
+```
+
+to
+
+```java
+
+interface SquarerAPI extends AparapiAPI{
+   void square(Range range, int[] in, int[] square);
+}
+
+@Implements(SquarerAPI.class) class Squarer extends Kernel{
+   public void square(RangeId rangeId, int[] in, int[] square){
+      square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid];
+   }
+}
+
+
+Squarer squarer = new Squarer();
+int[] in = // create and fill in[SIZE]
+int[] square = // create square[SIZE];
+
+squarer.api().square(Range.create(SIZE), in, square);
+```
+
+I think that this makes Aparapi look more conventional. It also allows overloading for the first time.
+
+```java
+
+interface SquarerAPI extends AparapiAPI{
+   void square(Range range, int[] in, int[] square);
+   void square(Range range, float[] in, float[] square);
+}
+
+@Implements(SquarerAPI.class) class Squarer extends Kernel{
+   public void square(RangeId rangeId, int[] in, int[] square){
+      square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid];
+   }
+   public void square(RangeId rangeId, float[] in, float[] square){
+      square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid];
+   }
+}
+
+
+Squarer squarer = new Squarer();
+int[] in = // create and fill in[SIZE]
+int[] square = // create square[SIZE];
+
+squarer.api().square(Range.create(SIZE), in, square);
+float[] inf = // create and fill inf[SIZE]
+float[] squaref = // create squaref[SIZE];
+
+squarer.api().square(Range.create(SIZE), inf, squaref);
+```
+
+Here is a test harness
+
+```java
+
+import java.lang.reflect.InvocationHandler;
+import java.lang.reflect.Method;
+import java.lang.reflect.Proxy;
+
+
+public class Ideal{
+
+   public static class OpenCLInvocationHandler<T> implements InvocationHandler {
+      Object instance;
+      OpenCLInvocationHandler(Object _instance){
+         instance = _instance;
+      }
+      @Override public Object invoke(Object interfaceThis, Method interfaceMethod, Object[] interfaceArgs) throws Throwable {
+         Class clazz = instance.getClass();
+
+         // Map the interface signature (Range first) to the implementation
+         // signature (RangeId first) and look the method up reflectively.
+         Class[] argTypes = interfaceMethod.getParameterTypes();
+         argTypes[0]=RangeId.class;
+         Method method = clazz.getDeclaredMethod(interfaceMethod.getName(), argTypes);
+
+
+         if (method == null){
+            System.out.println("can't find method");
+         }else{
+            // Emulate dispatch across the whole range with a sequential loop.
+            RangeId rangeId = new RangeId((Range)interfaceArgs[0]);
+            interfaceArgs[0]=rangeId;
+            for (rangeId.wgid = 0; rangeId.wgid <rangeId.r.width; rangeId.wgid++){
+               method.invoke(instance, interfaceArgs);
+            }
+         }
+
+         return null;
+      }
+   }
+
+   static class Range{
+      int width;
+      Range(int _width) {
+         width = _width;
+      }
+   }
+
+   static class Range2D extends Range{
+      int height;
+
+      Range2D(int _width, int _height) {
+         super(_width);
+         height = _height;
+      }
+   }
+
+   static class Range1DId<T extends Range>{
+      Range1DId(T _r){
+         r = _r;
+      }
+      T r;
+
+      int wgid, wlid, wgsize, wlsize, wgroup;
+   }
+
+   static class RangeId extends Range1DId<Range>{
+      RangeId(Range r){
+         super(r);
+      }
+   }
+
+   static class Range2DId extends Range1DId<Range2D>{
+      Range2DId(Range2D r){
+         super(r);
+      }
+
+      int hgid, hlid, hgsize, hlsize, hgroup;
+   }
+
+
+
+
+
+   static <T> T create(Object _instance, Class<T> _interface) {
+      
OpenCLInvocationHandler<T> invocationHandler = new OpenCLInvocationHandler<T>(_instance); + T instance = (T) Proxy.newProxyInstance(Ideal.class.getClassLoader(), new Class[] { + _interface, + + }, invocationHandler); + return (instance); + + } + + + + public static class Squarer{ + interface API { + public API foo(Range range, int[] in, int[] out); + public Squarer dispatch(); + + } + + public API foo(RangeId rangeId, int[] in, int[] out) { + out[rangeId.wgid] = in[rangeId.wgid]*in[rangeId.wgid]; + return(null); + } + } + + /** + * @param args + */ + public static void main(String[] args) { + + Squarer.API squarer = create(new Squarer(), Squarer.API.class); + int[] in = new int[] { + 1, + 2, + 3, + 4, + 5, + 6 + }; + int[] out = new int[in.length]; + Range range = new Range(in.length); + + squarer.foo(range, in, out); + + for (int s:out){ + System.out.println(s); + } + + } + +} +``` \ No newline at end of file