From 46557fcc82738926c3dd99e09baa82b48a620f04 Mon Sep 17 00:00:00 2001
From: Gary Frost <frost.gary@gmail.com>
Date: Sat, 19 May 2012 14:05:53 +0000
Subject: [PATCH] Add an FFT sample (FFTExample.java + fft.cl) to the extension samples

---
 .../aparapi/sample/extension/FFTExample.java  | 117 +++
 .../com/amd/aparapi/sample/extension/fft.cl   | 737 ++++++++++++++++++
 2 files changed, 854 insertions(+)
 create mode 100644 samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java
 create mode 100644 samples/extension/src/com/amd/aparapi/sample/extension/fft.cl

diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java
new file mode 100644
index 00000000..4d57a709
--- /dev/null
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java
@@ -0,0 +1,117 @@
+package com.amd.aparapi.sample.extension;
+
+import java.util.Arrays;
+
+import com.amd.aparapi.Device;
+import com.amd.aparapi.OpenCL;
+import com.amd.aparapi.OpenCLDevice;
+import com.amd.aparapi.Range;
+
+public class FFTExample{
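+   // Aparapi binds this interface to the OpenCL source named in @OpenCL.Resource:
+   // calling forward(...) dispatches the __kernel of the same name in fft.cl over
+   // the given Range, with the annotated arrays passed as global buffers.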
+   @OpenCL.Resource("com/amd/aparapi/sample/extension/fft.cl") interface FFT extends OpenCL<FFT>{
+
+      public FFT forward(//
+            Range _range,//
+            @GlobalReadWrite("real") float[] real,//
+            @GlobalReadWrite("imaginary") float[] imaginary//
+      );
+   }
+
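+   /** In-place radix-2 FFT on 2^m points; plain Java reference for checking the OpenCL result. */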
+   static void fft(float[] x, float[] y) {
+      short dir = 1; // 1 = forward transform
+      long m = 10;   // log2 of the number of points (2^10 = 1024)
+      int n, i, i1, j, k, i2, l, l1, l2;
+      double c1, c2, tx, ty, t1, t2, u1, u2, z;
+
+      /* Calculate the number of points */
+      n = 1;
+      for (i = 0; i < m; i++)
+         n *= 2;
+
+      /* Do the bit reversal */
+      i2 = n >> 1;
+      j = 0;
+      for (i = 0; i < n - 1; i++) {
+         if (i < j) {
+            tx = x[i];
+            ty = y[i];
+            x[i] = x[j];
+            y[i] = y[j];
+            x[j] = (float) tx;
+            y[j] = (float) ty;
+         }
+         k = i2;
+         while (k <= j) {
+            j -= k;
+            k >>= 1;
+         }
+         j += k;
+      }
+
+      /* Compute the FFT */
+      c1 = -1.0;
+      c2 = 0.0;
+      l2 = 1;
+      for (l = 0; l < m; l++) {
+         l1 = l2;
+         l2 <<= 1;
+         u1 = 1.0;
+         u2 = 0.0;
+         for (j = 0; j < l1; j++) {
+            for (i = j; i < n; i += l2) {
+               i1 = i + l1;
+               t1 = u1 * x[i1] - u2 * y[i1];
+               t2 = u1 * y[i1] + u2 * x[i1];
+               x[i1] = (float) (x[i] - t1);
+               y[i1] = (float) (y[i] - t2);
+               x[i] += (float) t1;
+               y[i] += (float) t2;
+            }
+            z = u1 * c1 - u2 * c2;
+            u2 = u1 * c2 + u2 * c1;
+            u1 = z;
+         }
+         c2 = Math.sqrt((1.0 - c1) / 2.0);
+         if (dir == 1)
+            c2 = -c2;
+         c1 = Math.sqrt((1.0 + c1) / 2.0);
+      }
+
+      /* Scaling for forward transform (left disabled; the OpenCL kernel does not scale its output either) */
+      /*if (dir == 1) {
+         for (i=0;i<n;i++) {
+            x[i] /= n;
+            y[i] /= n;
+         }
+      }*/
+
+   }
+
+   public static void main(String[] args) {
+      final int LEN = 1024;
+      float initial[] = new float[LEN];
+      float real[] = new float[LEN];
+      float imaginary[] = new float[LEN];
+      float referenceReal[] = Arrays.copyOf(real, real.length);
+      float referenceImaginary[] = Arrays.copyOf(imaginary, imaginary.length);
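+      // Device.best() selects the most capable OpenCL device; bind() loads and
+      // compiles fft.cl for it and returns an implementation of the FFT interface.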
+      OpenCLDevice device = (OpenCLDevice) Device.best();
+      FFT fft = device.bind(FFT.class);
+      for (int i = 0; i < LEN; i++) {
+         initial[i] = real[i] = referenceReal[i] = (float) (Math.random() * 256);
+         imaginary[i] = referenceImaginary[i] = 0f;
+      }
+
+      Range range = device.createRange(64);
+      System.out.println("range=" + range);
+      long start = System.nanoTime();
+      fft.forward(range, real, imaginary);
+      System.out.println("opencl " + ((System.nanoTime() - start) / 1000000));
+      start = System.nanoTime();
+      fft(referenceReal, referenceImaginary);
+      System.out.println("java " + ((System.nanoTime() - start) / 1000000));
+      for (int i = 0; i < LEN; i++) {
+         System.out.printf("%4d  in=%8.2f  opencl=%8.2f  java=%8.2f%n", i, initial[i], real[i], referenceReal[i]);
+      }
+
+   }
+}
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/fft.cl b/samples/extension/src/com/amd/aparapi/sample/extension/fft.cl
new file mode 100644
index 00000000..f4085983
--- /dev/null
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/fft.cl
@@ -0,0 +1,737 @@
+/* ============================================================
+
+Copyright (c) 2009-2010 Advanced Micro Devices, Inc.  All rights reserved.
+ 
+Redistribution and use of this material is permitted under the following 
+conditions:
+ 
+Redistributions must retain the above copyright notice and all terms of this 
+license.
+ 
+In no event shall anyone redistributing or accessing or using this material 
+commence or participate in any arbitration or legal action relating to this 
+material against Advanced Micro Devices, Inc. or any copyright holders or 
+contributors. The foregoing shall survive any expiration or termination of 
+this license or any agreement or access or use related to this material. 
+
+ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION 
+OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL.
+
+THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT 
+HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY 
+REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO 
+SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERATION, OR THAT IT IS FREE 
+FROM DEFECTS OR VIRUSES.  ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER 
+EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED 
+WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, 
+ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. 
+IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR 
+BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY 
+ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY 
+OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, 
+INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS 
+(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS 
+THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND 
+ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, 
+OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE 
+FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE 
+CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR 
+DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
+CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE 
+THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL 
+SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR 
+ACCESS OR USE RELATED TO THIS MATERIAL.
+
+NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS 
+MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO 
+RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER 
+COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH 
+AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS 
+DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. 
+MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED,
+EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, 
+INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, 
+COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. 
+MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY 
+LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL.
+
+NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is 
+provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to 
+computer software and technical data, respectively. Use, duplication, 
+distribution or disclosure by the U.S. Government and/or DOD agencies is 
+subject to the full extent of restrictions in all applicable regulations, 
+including those found at FAR52.227 and DFARS252.227 et seq. and any successor 
+regulations thereof. Use of this material by the U.S. Government and/or DOD 
+agencies is acknowledgment of the proprietary rights of any copyright holders 
+and contributors, including those of Advanced Micro Devices, Inc., as well as 
+the provisions of FAR52.227-14 through 23 regarding privately developed and/or 
+commercial computer software.
+
+This license forms the entire agreement regarding the subject matter hereof and 
+supersedes all proposals and prior discussions and writings between the parties 
+with respect thereto. This license does not affect any ownership, rights, title,
+or interest in, or relating to, this material. No terms of this license can be 
+modified or waived, and no breach of this license can be excused, unless done 
+so in a writing signed by all affected parties. Each term of this license is 
+separately enforceable. If any term of this license is determined to be or 
+becomes unenforceable or illegal, such term shall be reformed to the minimum 
+extent necessary in order for this license to remain in effect in accordance 
+with its terms as modified by such reformation. This license shall be governed 
+by and construed in accordance with the laws of the State of Texas without 
+regard to rules on conflicts of law of any state or jurisdiction or the United 
+Nations Convention on the International Sale of Goods. All disputes arising out 
+of this license shall be subject to the jurisdiction of the federal and state 
+courts in Austin, Texas, and all defenses are hereby waived concerning personal 
+jurisdiction and venue of these courts.
+
+============================================================ */
+
+
+// ANGLE = 2*PI/1024; the hex float 0x1.921fb6p-8 is approximately 0.0061359
+#define ANGLE 0x1.921fb6p-8F
+
+// Return sin and cos of -2*pi*i/1024
+__attribute__((always_inline)) float
+k_sincos(int i, float *cretp)
+{
+    if (i > 512)
+	i -= 1024;
+
+    float x = i * -ANGLE;
+    *cretp = native_cos(x);
+    return native_sin(x);
+}
+
+__attribute__((always_inline)) float4
+k_sincos4(int4 i, float4 *cretp)
+{
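+    // A vector compare yields -1 in each true lane, so the mask & 1024 subtracts
+    // 1024 only in the lanes where i > 512.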
+    i -= (i > 512) & 1024;
+    float4 x = convert_float4(i) * -ANGLE;
+    *cretp = native_cos(x);
+    return native_sin(x);
+}
+
+// Twiddle factors: TWGEN/TW4GEN compute cos/sin of -2*PI*(tbase*I)/1024, and
+// TWAPPLY multiplies (ZR, ZI) by that factor in place.
+#define TWGEN(I,C,S) \
+    float C; \
+    float S = k_sincos(tbase * I, &C)
+
+#define TW4GEN(I,C,S) \
+    float4 C; \
+    float4 S = k_sincos4(tbase * I, &C)
+
+#define TWAPPLY(ZR, ZI, C, S) \
+    do { \
+	float4 __r = C * ZR - S * ZI; \
+	ZI = C * ZI + S * ZR; \
+	ZR = __r; \
+    } while (0)
+
+#define TW4IDDLE4() \
+    do { \
+        TW4GEN(1, c1, s1); \
+        TWAPPLY(zr1, zi1, c1, s1); \
+        TW4GEN(2, c2, s2); \
+        TWAPPLY(zr2, zi2, c2, s2); \
+        TW4GEN(3, c3, s3); \
+        TWAPPLY(zr3, zi3, c3, s3); \
+    } while (0)
+    
+#define TWIDDLE4() \
+    do { \
+        TWGEN(1, c1, s1); \
+        TWAPPLY(zr1, zi1, c1, s1); \
+        TWGEN(2, c2, s2); \
+        TWAPPLY(zr2, zi2, c2, s2); \
+        TWGEN(3, c3, s3); \
+        TWAPPLY(zr3, zi3, c3, s3); \
+    } while (0)
+
+// 4-point FFT butterfly, applied independently in each lane of the float4
+// registers zr0..zr3 / zi0..zi3
+#define FFT4() \
+    do { \
+        float4 ar0 = zr0 + zr2; \
+        float4 ar2 = zr1 + zr3; \
+        float4 br0 = ar0 + ar2; \
+        float4 br1 = zr0 - zr2; \
+        float4 br2 = ar0 - ar2; \
+        float4 br3 = zr1 - zr3; \
+        float4 ai0 = zi0 + zi2; \
+        float4 ai2 = zi1 + zi3; \
+        float4 bi0 = ai0 + ai2; \
+        float4 bi1 = zi0 - zi2; \
+        float4 bi2 = ai0 - ai2; \
+        float4 bi3 = zi1 - zi3; \
+        zr0 = br0; \
+        zi0 = bi0; \
+        zr1 = br1 + bi3; \
+        zi1 = bi1 - br3; \
+        zr3 = br1 - bi3; \
+        zi3 = br3 + bi1; \
+        zr2 = br2; \
+        zi2 = bi2; \
+    } while (0)
+
+// First pass of 1K FFT
+__attribute__((always_inline)) void
+kfft_pass1(uint me,
+	    const __global float *gr, const __global float *gi,
+	    __local float *lds)
+{
+    const __global float4 *gp;
+    __local float *lp;
+
+    // Pull in transform data
+    gp = (const __global float4 *)(gr + (me << 2));
+    float4 zr0 = gp[0*64];
+    float4 zr1 = gp[1*64];
+    float4 zr2 = gp[2*64];
+    float4 zr3 = gp[3*64];
+
+    gp = (const __global float4 *)(gi + (me << 2));
+    float4 zi0 = gp[0*64];
+    float4 zi1 = gp[1*64];
+    float4 zi2 = gp[2*64];
+    float4 zi3 = gp[3*64];
+
+    FFT4();
+
+    int4 tbase = (int)(me << 2) + (int4)(0, 1, 2, 3);
+    TW4IDDLE4();
+
+    // Save registers
+    // Note that this pointer is not aligned enough to be cast to a float4*
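+    // Rows are stored with a stride of 66 floats (rather than 64) so that
+    // consecutive work-items hit different LDS banks.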
+    lp = lds + ((me << 2) + (me >> 3));
+
+    lp[0] = zr0.x;
+    lp[1] = zr0.y;
+    lp[2] = zr0.z;
+    lp[3] = zr0.w;
+    lp += 66*4;
+
+    lp[0] = zr1.x;
+    lp[1] = zr1.y;
+    lp[2] = zr1.z;
+    lp[3] = zr1.w;
+    lp += 66*4;
+
+    lp[0] = zr2.x;
+    lp[1] = zr2.y;
+    lp[2] = zr2.z;
+    lp[3] = zr2.w;
+    lp += 66*4;
+
+    lp[0] = zr3.x;
+    lp[1] = zr3.y;
+    lp[2] = zr3.z;
+    lp[3] = zr3.w;
+    lp += 66*4;
+
+    // Imaginary part
+    lp[0] = zi0.x;
+    lp[1] = zi0.y;
+    lp[2] = zi0.z;
+    lp[3] = zi0.w;
+    lp += 66*4;
+
+    lp[0] = zi1.x;
+    lp[1] = zi1.y;
+    lp[2] = zi1.z;
+    lp[3] = zi1.w;
+    lp += 66*4;
+
+    lp[0] = zi2.x;
+    lp[1] = zi2.y;
+    lp[2] = zi2.z;
+    lp[3] = zi2.w;
+    lp += 66*4;
+
+    lp[0] = zi3.x;
+    lp[1] = zi3.y;
+    lp[2] = zi3.z;
+    lp[3] = zi3.w;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// Second pass of 1K FFT
+__attribute__((always_inline)) void
+kfft_pass2(uint me, __local float *lds)
+{
+    __local float *lp;
+
+    // Load registers
+    lp = lds + (me + (me >> 5));
+
+    float4 zr0, zr1, zr2, zr3;
+
+    zr0.x = lp[0*66];
+    zr1.x = lp[1*66];
+    zr2.x = lp[2*66];
+    zr3.x = lp[3*66];
+    lp += 66*4;
+
+    zr0.y = lp[0*66];
+    zr1.y = lp[1*66];
+    zr2.y = lp[2*66];
+    zr3.y = lp[3*66];
+    lp += 66*4;
+
+    zr0.z = lp[0*66];
+    zr1.z = lp[1*66];
+    zr2.z = lp[2*66];
+    zr3.z = lp[3*66];
+    lp += 66*4;
+
+    zr0.w = lp[0*66];
+    zr1.w = lp[1*66];
+    zr2.w = lp[2*66];
+    zr3.w = lp[3*66];
+    lp += 66*4;
+
+    float4 zi0, zi1, zi2, zi3;
+
+    zi0.x = lp[0*66];
+    zi1.x = lp[1*66];
+    zi2.x = lp[2*66];
+    zi3.x = lp[3*66];
+    lp += 66*4;
+
+    zi0.y = lp[0*66];
+    zi1.y = lp[1*66];
+    zi2.y = lp[2*66];
+    zi3.y = lp[3*66];
+    lp += 66*4;
+
+    zi0.z = lp[0*66];
+    zi1.z = lp[1*66];
+    zi2.z = lp[2*66];
+    zi3.z = lp[3*66];
+    lp += 66*4;
+
+    zi0.w = lp[0*66];
+    zi1.w = lp[1*66];
+    zi2.w = lp[2*66];
+    zi3.w = lp[3*66];
+
+    // Transform and twiddle
+    FFT4();
+
+    int tbase = (int)(me << 2);
+    TWIDDLE4();
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Store registers
+    lp = lds + ((me << 2) + (me >> 3));
+
+    lp[0] = zr0.x;
+    lp[1] = zr1.x;
+    lp[2] = zr2.x;
+    lp[3] = zr3.x;
+    lp += 66*4;
+
+    lp[0] = zr0.y;
+    lp[1] = zr1.y;
+    lp[2] = zr2.y;
+    lp[3] = zr3.y;
+    lp += 66*4;
+
+    lp[0] = zr0.z;
+    lp[1] = zr1.z;
+    lp[2] = zr2.z;
+    lp[3] = zr3.z;
+    lp += 66*4;
+
+    lp[0] = zr0.w;
+    lp[1] = zr1.w;
+    lp[2] = zr2.w;
+    lp[3] = zr3.w;
+    lp += 66*4;
+
+    // Imaginary part
+    lp[0] = zi0.x;
+    lp[1] = zi1.x;
+    lp[2] = zi2.x;
+    lp[3] = zi3.x;
+    lp += 66*4;
+
+    lp[0] = zi0.y;
+    lp[1] = zi1.y;
+    lp[2] = zi2.y;
+    lp[3] = zi3.y;
+    lp += 66*4;
+
+    lp[0] = zi0.z;
+    lp[1] = zi1.z;
+    lp[2] = zi2.z;
+    lp[3] = zi3.z;
+    lp += 66*4;
+
+    lp[0] = zi0.w;
+    lp[1] = zi1.w;
+    lp[2] = zi2.w;
+    lp[3] = zi3.w;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// Third pass of 1K FFT
+__attribute__((always_inline)) void
+kfft_pass3(uint me, __local float *lds)
+{
+    __local float *lp;
+
+    // Load registers
+    lp = lds + (me + (me >> 5));
+
+    float4 zr0, zr1, zr2, zr3;
+
+    zr0.x = lp[0*66];
+    zr1.x = lp[1*66];
+    zr2.x = lp[2*66];
+    zr3.x = lp[3*66];
+    lp += 66*4;
+
+    zr0.y = lp[0*66];
+    zr1.y = lp[1*66];
+    zr2.y = lp[2*66];
+    zr3.y = lp[3*66];
+    lp += 66*4;
+
+    zr0.z = lp[0*66];
+    zr1.z = lp[1*66];
+    zr2.z = lp[2*66];
+    zr3.z = lp[3*66];
+    lp += 66*4;
+
+    zr0.w = lp[0*66];
+    zr1.w = lp[1*66];
+    zr2.w = lp[2*66];
+    zr3.w = lp[3*66];
+    lp += 66*4;
+
+    float4 zi0, zi1, zi2, zi3;
+
+    zi0.x = lp[0*66];
+    zi1.x = lp[1*66];
+    zi2.x = lp[2*66];
+    zi3.x = lp[3*66];
+    lp += 66*4;
+
+    zi0.y = lp[0*66];
+    zi1.y = lp[1*66];
+    zi2.y = lp[2*66];
+    zi3.y = lp[3*66];
+    lp += 66*4;
+
+    zi0.z = lp[0*66];
+    zi1.z = lp[1*66];
+    zi2.z = lp[2*66];
+    zi3.z = lp[3*66];
+    lp += 66*4;
+
+    zi0.w = lp[0*66];
+    zi1.w = lp[1*66];
+    zi2.w = lp[2*66];
+    zi3.w = lp[3*66];
+
+    // Transform and twiddle
+    FFT4();
+
+    int tbase = (int)((me >> 2) << 4);
+    TWIDDLE4();
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Save registers
+    lp = lds + me;
+
+    lp[0*66] = zr0.x;
+    lp[1*66] = zr0.y;
+    lp[2*66] = zr0.z;
+    lp[3*66] = zr0.w;
+    lp += 66*4;
+
+    lp[0*66] = zr1.x;
+    lp[1*66] = zr1.y;
+    lp[2*66] = zr1.z;
+    lp[3*66] = zr1.w;
+    lp += 66*4;
+
+    lp[0*66] = zr2.x;
+    lp[1*66] = zr2.y;
+    lp[2*66] = zr2.z;
+    lp[3*66] = zr2.w;
+    lp += 66*4;
+
+    lp[0*66] = zr3.x;
+    lp[1*66] = zr3.y;
+    lp[2*66] = zr3.z;
+    lp[3*66] = zr3.w;
+    lp += 66*4;
+
+    // Imaginary part
+    lp[0*66] = zi0.x;
+    lp[1*66] = zi0.y;
+    lp[2*66] = zi0.z;
+    lp[3*66] = zi0.w;
+    lp += 66*4;
+
+    lp[0*66] = zi1.x;
+    lp[1*66] = zi1.y;
+    lp[2*66] = zi1.z;
+    lp[3*66] = zi1.w;
+    lp += 66*4;
+
+    lp[0*66] = zi2.x;
+    lp[1*66] = zi2.y;
+    lp[2*66] = zi2.z;
+    lp[3*66] = zi2.w;
+    lp += 66*4;
+
+    lp[0*66] = zi3.x;
+    lp[1*66] = zi3.y;
+    lp[2*66] = zi3.z;
+    lp[3*66] = zi3.w;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// Fourth pass of 1K FFT
+__attribute__((always_inline)) void
+kfft_pass4(uint me, __local float *lds)
+{
+    __local float *lp;
+
+    // Load registers
+    lp = lds + ((me & 0x3) + ((me >> 2) & 0x3)*(66*4) + ((me >> 4) << 2));
+
+    float4 zr0, zr1, zr2, zr3;
+
+    zr0.x = lp[0*66];
+    zr0.y = lp[1*66];
+    zr0.z = lp[2*66];
+    zr0.w = lp[3*66];
+    lp += 16;
+
+    zr1.x = lp[0*66];
+    zr1.y = lp[1*66];
+    zr1.z = lp[2*66];
+    zr1.w = lp[3*66];
+    lp += 16;
+
+    zr2.x = lp[0*66];
+    zr2.y = lp[1*66];
+    zr2.z = lp[2*66];
+    zr2.w = lp[3*66];
+    lp += 16;
+
+    zr3.x = lp[0*66];
+    zr3.y = lp[1*66];
+    zr3.z = lp[2*66];
+    zr3.w = lp[3*66];
+    lp += 66*4*4 - 3*16;
+
+    float4 zi0, zi1, zi2, zi3;
+
+    zi0.x = lp[0*66];
+    zi0.y = lp[1*66];
+    zi0.z = lp[2*66];
+    zi0.w = lp[3*66];
+    lp += 16;
+
+    zi1.x = lp[0*66];
+    zi1.y = lp[1*66];
+    zi1.z = lp[2*66];
+    zi1.w = lp[3*66];
+    lp += 16;
+
+    zi2.x = lp[0*66];
+    zi2.y = lp[1*66];
+    zi2.z = lp[2*66];
+    zi2.w = lp[3*66];
+    lp += 16;
+
+    zi3.x = lp[0*66];
+    zi3.y = lp[1*66];
+    zi3.z = lp[2*66];
+    zi3.w = lp[3*66];
+
+    // Transform and twiddle
+    FFT4();
+
+    int tbase = (int)((me >> 4) << 6);
+    TWIDDLE4();
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Save registers in conflict free manner
+    lp = lds + me;
+
+    lp[0*68] = zr0.x;
+    lp[1*68] = zr0.y;
+    lp[2*68] = zr0.z;
+    lp[3*68] = zr0.w;
+    lp += 68*4;
+
+    lp[0*68] = zr1.x;
+    lp[1*68] = zr1.y;
+    lp[2*68] = zr1.z;
+    lp[3*68] = zr1.w;
+    lp += 68*4;
+
+    lp[0*68] = zr2.x;
+    lp[1*68] = zr2.y;
+    lp[2*68] = zr2.z;
+    lp[3*68] = zr2.w;
+    lp += 68*4;
+
+    lp[0*68] = zr3.x;
+    lp[1*68] = zr3.y;
+    lp[2*68] = zr3.z;
+    lp[3*68] = zr3.w;
+    lp += 68*4;
+
+    // Imaginary part
+    lp[0*68] = zi0.x;
+    lp[1*68] = zi0.y;
+    lp[2*68] = zi0.z;
+    lp[3*68] = zi0.w;
+    lp += 68*4;
+
+    lp[0*68] = zi1.x;
+    lp[1*68] = zi1.y;
+    lp[2*68] = zi1.z;
+    lp[3*68] = zi1.w;
+    lp += 68*4;
+
+    lp[0*68] = zi2.x;
+    lp[1*68] = zi2.y;
+    lp[2*68] = zi2.z;
+    lp[3*68] = zi2.w;
+    lp += 68*4;
+
+    lp[0*68] = zi3.x;
+    lp[1*68] = zi3.y;
+    lp[2*68] = zi3.z;
+    lp[3*68] = zi3.w;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// Fifth and last pass of 1K FFT
+__attribute__((always_inline)) void
+kfft_pass5(uint me,
+	   const __local float *lds,
+	   __global float *gr, __global float *gi)
+{
+    const __local float *lp;
+
+    // Load registers
+    lp = lds + ((me & 0xf) + (me >> 4)*(68*4));
+
+    float4 zr0, zr1, zr2, zr3;
+
+    zr0.x = lp[0*68];
+    zr0.y = lp[1*68];
+    zr0.z = lp[2*68];
+    zr0.w = lp[3*68];
+    lp += 16;
+
+    zr1.x = lp[0*68];
+    zr1.y = lp[1*68];
+    zr1.z = lp[2*68];
+    zr1.w = lp[3*68];
+    lp += 16;
+
+    zr2.x = lp[0*68];
+    zr2.y = lp[1*68];
+    zr2.z = lp[2*68];
+    zr2.w = lp[3*68];
+    lp += 16;
+
+    zr3.x = lp[0*68];
+    zr3.y = lp[1*68];
+    zr3.z = lp[2*68];
+    zr3.w = lp[3*68];
+
+    lp += 68*4*4 - 3*16;
+
+    float4 zi0, zi1, zi2, zi3;
+
+    zi0.x = lp[0*68];
+    zi0.y = lp[1*68];
+    zi0.z = lp[2*68];
+    zi0.w = lp[3*68];
+    lp += 16;
+
+    zi1.x = lp[0*68];
+    zi1.y = lp[1*68];
+    zi1.z = lp[2*68];
+    zi1.w = lp[3*68];
+    lp += 16;
+
+    zi2.x = lp[0*68];
+    zi2.y = lp[1*68];
+    zi2.z = lp[2*68];
+    zi2.w = lp[3*68];
+    lp += 16;
+
+    zi3.x = lp[0*68];
+    zi3.y = lp[1*68];
+    zi3.z = lp[2*68];
+    zi3.w = lp[3*68];
+
+    // Transform
+    FFT4();
+
+    // Save result
+    __global float4 *gp = (__global float4 *)(gr + (me << 2));
+    gp[0*64] = zr0;
+    gp[1*64] = zr1;
+    gp[2*64] = zr2;
+    gp[3*64] = zr3;
+
+    gp = (__global float4 *)(gi + (me << 2));
+    gp[0*64] = zi0;
+    gp[1*64] = zi1;
+    gp[2*64] = zi2;
+    gp[3*64] = zi3;
+}
+
+// Distance between first real element of successive 1K vectors
+// It must be >= 1024, and a multiple of 4
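+// e.g. (1024+4) would leave one float4 of padding between successive vectors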
+#define VSTRIDE (1024+0)
+
+// Performs one 1K-point complex FFT for every 64 global ids.
+// Each vector starts at a multiple of VSTRIDE floats from the first.
+// The number of global ids must be a multiple of 64, e.g. 1024*64.
+//
+//   greal - pointer to input and output real part of data
+//   gimag - pointer to input and output imaginary part of data
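+//
+//   The transform runs as five radix-4 passes (4^5 = 1024): pass1 reads from
+//   global memory into lds, passes 2-4 work entirely within lds, and pass5
+//   writes the result back to global memory.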
+__kernel void
+forward(__global float *greal, __global float *gimag)
+{
+    // 68*4*4*2 floats = 8704 bytes of local memory
+    __local float lds[68*4*4*2];
+
+    __global float *gr;
+    __global float *gi;
+    uint gid = get_global_id(0);
+    uint me = gid & 0x3fU;
+    uint dg = (gid >> 6) * VSTRIDE;
+
+    gr = greal + dg;
+    gi = gimag + dg;
+
+    kfft_pass1(me, gr, gi, lds);
+    kfft_pass2(me, lds);
+    kfft_pass3(me, lds);
+    kfft_pass4(me, lds);
+    kfft_pass5(me, lds, gr, gi);
+}
+
-- 
GitLab