diff --git a/src/main/java/com/aparapi/internal/kernel/KernelRunner.java b/src/main/java/com/aparapi/internal/kernel/KernelRunner.java index 070c0da741245f1ef1cefcee0ea26aaef529acd9..bc324d17f78fe902bc5e801ba4ccb5d8319b18a3 100644 --- a/src/main/java/com/aparapi/internal/kernel/KernelRunner.java +++ b/src/main/java/com/aparapi/internal/kernel/KernelRunner.java @@ -457,7 +457,7 @@ public class KernelRunner extends KernelRunnerJNI{ final int numGroups0 = _settings.range.getNumGroups(0); final int numGroups1 = _settings.range.getNumGroups(1); final int globalGroups = numGroups0 * numGroups1 * _settings.range.getNumGroups(2); - + /** * This localBarrier is only ever used by the kernels. If the kernel does not use the barrier the threads * can get out of sync, we promised nothing in JTP mode. @@ -557,14 +557,24 @@ public class KernelRunner extends KernelRunnerJNI{ @Override public void set(KernelState kernelState, int globalGroupId, int threadId) { // (kernelState, globalGroupId, threadId) ->{ - kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth = (for 33 = 1 % 4 = 1) - kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0) - - final int groupInset = globalGroupId % numGroups0; // 4%3 = 1 - kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5 - - final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2 - kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2 + final int localId0 = (threadId % localSize0); + final int localId1 = (threadId / localSize0); + kernelState.setLocalId(0, localId0); // threadId % localWidth = (for 33 = 1 % 4 = 1) + kernelState.setLocalId(1, localId1); // threadId / localWidth = (for 33 = 1 / 4 == 0) + + //The displacement in the overall 2D computation grid in the X direction is + //the offset in X given by the current group being executed, plus the X displacement + //inside 
that work-group. + //Groups are numbered row by row, like this: + //[Group 0] [Group 1] [Group 2] + //[Group 3] [Group 4] [Group 5] + final int globalThreadIdOffsetX = (globalGroupId % numGroups0) * localSize0; + kernelState.setGlobalId(0, globalThreadIdOffsetX + localId0); + + //Same as for X, but now for the Y direction. + final int globalThreadIdOffsetY = (globalGroupId / numGroups0) * localSize1; + kernelState.setGlobalId(1, globalThreadIdOffsetY + localId1); + kernelState.setGroupId(0, (globalGroupId % numGroups0)); kernelState.setGroupId(1, (globalGroupId / numGroups0)); }