diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44bd859387edb83bd6ba9d4a2c4a0c5c2c7f797b..981c2e29a474a70324ef3b9292de7f722e967189 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## 1.8.0
 * Updated KernelManager to facilitate class extensions having constructors with non static parameters
+* Enable kernel profiling and execution simultaneously on multiple devices (multiple threads calling same kernel class on multiple devices)
 
 ## 1.7.0
 
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f0959e8c337122f4c6b9d2634b7025db014a94f2..84040ab351b91d7d553733e267789bf4fcec7a37 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -48,4 +48,5 @@ Below are some of the specific details of various contributions.
 * Luis Mendes submited PR for issue #84 - Fully support OpenCL 1.2 barrier() - localBarrier(),  globalBarrier() and localGlobalBarrier()
 * Luis Mendes with suggestions by Automenta submited PR for issue #62 and implemented new thread-safe API for Kernel profiling
 * Luis Mendes submited PR for issue #101 - Possible deadlock in JTP mode
-* Luis Mendes submited PR to facilitate KernelManager class extension with non-static parameters in constructors
\ No newline at end of file
+* Luis Mendes submited PR to facilitate KernelManager class extension with non-static parameters in constructors
+* Luis Mendes submitted PR to enable kernel profiling and execution simultaneously on multiple devices
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 1d1d5150d7ce15d391baa9b6e28fb806272654bf..122f5949406867ae2b71de50370b33599bcc22c1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -87,7 +87,7 @@
         <dependency>
             <groupId>com.aparapi</groupId>
             <artifactId>aparapi-jni</artifactId>
-            <version>1.1.2</version>
+            <version>1.2.0</version>
         </dependency>
         <dependency>
             <groupId>junit</groupId>
diff --git a/src/main/java/com/aparapi/device/Device.java b/src/main/java/com/aparapi/device/Device.java
index f15835e6eb9a5452a5f2a22e73828048ba1a03d6..2e43c43c718958bf7359ad36fde02dbc3f7a358b 100644
--- a/src/main/java/com/aparapi/device/Device.java
+++ b/src/main/java/com/aparapi/device/Device.java
@@ -18,7 +18,7 @@ package com.aparapi.device;
 import com.aparapi.*;
 import com.aparapi.internal.kernel.*;
 
-public abstract class Device{
+public abstract class Device implements Comparable<Device> {
 
    public static enum TYPE {
       UNKNOWN(Integer.MAX_VALUE),
@@ -179,4 +179,20 @@ public abstract class Device{
    public int hashCode() {
       return Long.valueOf(getDeviceId()).hashCode();
    }
+   
+   /**
+    * Orders devices by ascending type rank; devices of equal rank are ordered by ascending device id.
+    *
+    * @param other the device to compare this device against
+    * @return -1, 0 or 1 when this device sorts before, equal to or after {@code other}
+    */
+   @Override
+   public int compareTo(Device other) {
+      if (type.rank != other.type.rank) {
+         return (type.rank < other.type.rank) ? -1 : 1;
+      }
+      final long thisId = getDeviceId();
+      final long otherId = other.getDeviceId();
+      return (thisId < otherId) ? -1 : ((thisId > otherId) ? 1 : 0);
+   }
 }
diff --git a/src/main/java/com/aparapi/device/JavaDevice.java b/src/main/java/com/aparapi/device/JavaDevice.java
index 4b687dd6b48faad5e319e0ee1781b94929d40c74..a392cdf162579f1a51d23f0e5d815600f1a3c2e3 100644
--- a/src/main/java/com/aparapi/device/JavaDevice.java
+++ b/src/main/java/com/aparapi/device/JavaDevice.java
@@ -15,7 +15,7 @@
  */
 package com.aparapi.device;
 
-public class JavaDevice extends Device {
+public class JavaDevice extends Device implements Comparable<Device> {
 
    public static final JavaDevice THREAD_POOL = new JavaDevice(TYPE.JTP, "Java Thread Pool", -3);
    public static final JavaDevice ALTERNATIVE_ALGORITHM = new JavaDevice(TYPE.ALT, "Java Alternative Algorithm", -2);
diff --git a/src/main/java/com/aparapi/device/OpenCLDevice.java b/src/main/java/com/aparapi/device/OpenCLDevice.java
index 5288e38172b8275225a6abb1225dc8abf7fec3ac..a105540e54ba508c8dac8e79e5bfc7424d847de4 100644
--- a/src/main/java/com/aparapi/device/OpenCLDevice.java
+++ b/src/main/java/com/aparapi/device/OpenCLDevice.java
@@ -45,7 +45,7 @@ import com.aparapi.opencl.OpenCL.Local;
 import com.aparapi.opencl.OpenCL.Resource;
 import com.aparapi.opencl.OpenCL.Source;
 
-public class OpenCLDevice extends Device{
+public class OpenCLDevice extends Device implements Comparable<Device> {
 
    private final OpenCLPlatform platform;
 
diff --git a/src/main/java/com/aparapi/internal/kernel/KernelManager.java b/src/main/java/com/aparapi/internal/kernel/KernelManager.java
index 7640a9af9785f383fb0d283f7da6c9c163a236d3..35a20e2c3a67d7e66b6672c2647e413dcec5738b 100644
--- a/src/main/java/com/aparapi/internal/kernel/KernelManager.java
+++ b/src/main/java/com/aparapi/internal/kernel/KernelManager.java
@@ -48,7 +48,7 @@ public class KernelManager {
 
    /**
     * Default KernelManager initialization.<br/>
-    * Convenience method for being overriden to an empty implementation, so that derived 
+    * Convenience method for being overridden to an empty implementation, so that derived 
     * KernelManager classes can provide non static parameters to their constructors.
     */
    protected void setup() {
diff --git a/src/main/java/com/aparapi/internal/kernel/KernelPreferences.java b/src/main/java/com/aparapi/internal/kernel/KernelPreferences.java
index 7c553390fde4758450579a62f4d1f62817b74882..28f307dcf0ceca674e486949e5d22daae56b0dc9 100644
--- a/src/main/java/com/aparapi/internal/kernel/KernelPreferences.java
+++ b/src/main/java/com/aparapi/internal/kernel/KernelPreferences.java
@@ -19,11 +19,15 @@ import com.aparapi.*;
 import com.aparapi.device.*;
 
 import java.util.*;
+import java.util.concurrent.atomic.AtomicReference;
 
+/**
+ * Thread safe class holding the kernel preferences for a given kernel class.
+ */
 public class KernelPreferences {
    private final Class<? extends Kernel> kernelClass;
    private final KernelManager manager;
-   private volatile LinkedList<Device> preferredDevices = null;
+   private final AtomicReference<LinkedList<Device>> preferredDevices = new AtomicReference<>(null);
    private final LinkedHashSet<Device> failedDevices = new LinkedHashSet<>();
 
    public KernelPreferences(KernelManager manager, Class<? extends Kernel> kernelClass) {
@@ -39,14 +43,16 @@ public class KernelPreferences {
    public List<Device> getPreferredDevices(Kernel kernel) {
       maybeSetUpDefaultPreferredDevices();
 
+      ArrayList<Device> copy;
+      synchronized (this) {
+         copy = new ArrayList<>(preferredDevices.get());
+      }
+
       if (kernel == null) {
-         return Collections.unmodifiableList(preferredDevices);
+         return Collections.unmodifiableList(copy);
       }
+      
       List<Device> localPreferredDevices = new ArrayList<>();
-      ArrayList<Device> copy;
-      synchronized (preferredDevices) {
-         copy = new ArrayList(preferredDevices);
-      }
       for (Device device : copy) {
          if (kernel.isAllowDevice(device)) {
             localPreferredDevices.add(device);
@@ -56,12 +62,12 @@ public class KernelPreferences {
    }
 
    synchronized void setPreferredDevices(LinkedHashSet<Device> _preferredDevices) {
-      if (preferredDevices != null) {
-         preferredDevices.clear();
-         preferredDevices.addAll(_preferredDevices);
+      if (preferredDevices.get() != null) {
+         preferredDevices.get().clear();
+         preferredDevices.get().addAll(_preferredDevices);
       }
       else {
-         preferredDevices = new LinkedList<>(_preferredDevices);
+         preferredDevices.set(new LinkedList<>(_preferredDevices));
       }
       failedDevices.clear();
    }
@@ -72,22 +78,18 @@ public class KernelPreferences {
    }
 
    synchronized void markPreferredDeviceFailed() {
-      if (preferredDevices.size() > 0) {
-         failedDevices.add(preferredDevices.remove(0));
+      if (!preferredDevices.get().isEmpty()) {
+         failedDevices.add(preferredDevices.get().removeFirst()); // move the current first-choice device to the failed set
       }
    }
 
    private void maybeSetUpDefaultPreferredDevices() {
-      if (preferredDevices == null) {
-         synchronized (this) {
-            if (preferredDevices == null) {
-               preferredDevices = new LinkedList<>(manager.getDefaultPreferences().getPreferredDevices(null));
-            }
-         }
-      }
+      if (preferredDevices.get() == null) { // pre-check avoids building the default list when one is already installed
+         preferredDevices.compareAndSet(null, new LinkedList<>(manager.getDefaultPreferences().getPreferredDevices(null)));
+      }
    }
 
-   public List<Device> getFailedDevices() {
+   public synchronized List<Device> getFailedDevices() {
       return new ArrayList<>(failedDevices);
    }
 }
diff --git a/src/main/java/com/aparapi/internal/kernel/KernelProfile.java b/src/main/java/com/aparapi/internal/kernel/KernelProfile.java
index 6803d8850b4ee7792422091e6c1a2f8424001240..933b3f2bb7d3a79c2c31037674963f1109d76b75 100644
--- a/src/main/java/com/aparapi/internal/kernel/KernelProfile.java
+++ b/src/main/java/com/aparapi/internal/kernel/KernelProfile.java
@@ -19,23 +19,22 @@ import com.aparapi.*;
 import com.aparapi.device.*;
 
 import java.util.*;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.logging.*;
 
 /**
- * Collects profiling information per kernel class per device. Not thread safe, it is necessary for client code to correctly synchronize on
- * objects of this class.
+ * Collects profiling information per kernel class per device.
  */
 public class KernelProfile {
 
    public static final double MILLION = 1000000d;
    private static Logger logger = Logger.getLogger(Config.getLoggerName());
    private final Class<? extends Kernel> kernelClass;
-   private LinkedHashMap<Device, KernelDeviceProfile> deviceProfiles = new LinkedHashMap<>();
-   private Device currentDevice;
-   private Device lastDevice;
-   private KernelDeviceProfile currentDeviceProfile;
+   private ConcurrentSkipListMap<Device, KernelDeviceProfile> deviceProfiles = new ConcurrentSkipListMap<>();
+   private final AtomicReference<Device> currentDevice = new AtomicReference<Device>(null);
    private IProfileReportObserver observer;
-
+   
    public KernelProfile(Class<? extends Kernel> _kernelClass) {
       kernelClass = _kernelClass;
    }
@@ -60,24 +59,47 @@ public class KernelProfile {
       }
    }
 
+   /**
+    * Retrieves the profile of the device most recently registered through onStart(Device), by any thread.
+    * @return the device profile, or null when no profiling sequence has been started yet
+    */
    public KernelDeviceProfile getLastDeviceProfile() {
-      return deviceProfiles.get(currentDevice);
+      final Device lastDevice = currentDevice.get(); // initially null, assigned by onStart(Device)
+      return (lastDevice == null) ? null : deviceProfiles.get(lastDevice); // the concurrent map rejects null keys
    }
 
+   /**
+    * Starts a profiling information gathering sequence for the current thread invoking this method
+    * regarding the specified execution device.
+    * @param device the device where the kernel is executed
+    */
    void onStart(Device device) {
-      synchronized (deviceProfiles) {
-         currentDeviceProfile = deviceProfiles.get(device);
-         if (currentDeviceProfile == null) {
-            currentDeviceProfile = new KernelDeviceProfile(this, kernelClass, device);
-            deviceProfiles.put(device, currentDeviceProfile);
+	  KernelDeviceProfile currentDeviceProfile = deviceProfiles.get(device);
+      if (currentDeviceProfile == null) {    	 
+         currentDeviceProfile = new KernelDeviceProfile(this, kernelClass, device);
+         KernelDeviceProfile existingProfile = deviceProfiles.putIfAbsent(device, currentDeviceProfile);
+         if (existingProfile != null) {
+        	 currentDeviceProfile = existingProfile;
          }
       }
       
       currentDeviceProfile.onEvent(ProfilingEvent.START);
-      currentDevice = device;
+      currentDevice.set(device);
    }
 
-   void onEvent(ProfilingEvent event) {
+   /**
+    * Updates the profiling information for the current thread invoking this method regarding
+    * the specified execution device. Events reported with a null device are logged and discarded.
+    * 
+    * @param device the device where the kernel is/was executed
+    * @param event the event for which the profiling information is being updated
+    */
+   void onEvent(Device device, ProfilingEvent event) {
+      if (device == null) {
+         logger.log(Level.WARNING, "Discarding profiling event " + event + " for null device, for Kernel class: " + kernelClass.getName());
+         return;
+      }
+      final KernelDeviceProfile deviceProfile = deviceProfiles.get(device);
       switch (event) {
          case CLASS_MODEL_BUILT: // fallthrough
          case OPENCL_GENERATED:  // fallthrough
@@ -86,10 +108,10 @@ public class KernelProfile {
          case PREPARE_EXECUTE:   // fallthrough
          case EXECUTED:          // fallthrough
          {
-            if (currentDeviceProfile == null) {
+            if (deviceProfile == null) {
                logger.log(Level.SEVERE, "Error in KernelProfile, no currentDevice (synchronization error?");
             }
-            currentDeviceProfile.onEvent(event);
+            deviceProfile.onEvent(event);
             break;
          }
          case START:
@@ -99,16 +121,6 @@ public class KernelProfile {
       }
    }
 
-   void onFinishedExecution() {
-      reset();
-   }
-
-   private void reset() {
-      lastDevice = currentDevice;
-      currentDevice = null;
-      currentDeviceProfile = null;
-   }
-
    public Collection<Device> getDevices() {
       return deviceProfiles.keySet();
    }
diff --git a/src/main/java/com/aparapi/internal/kernel/KernelRunner.java b/src/main/java/com/aparapi/internal/kernel/KernelRunner.java
index bc324d17f78fe902bc5e801ba4ccb5d8319b18a3..58960925a578a0d2f327b2208178214f90754280 100644
--- a/src/main/java/com/aparapi/internal/kernel/KernelRunner.java
+++ b/src/main/java/com/aparapi/internal/kernel/KernelRunner.java
@@ -366,7 +366,7 @@ public class KernelRunner extends KernelRunnerJNI{
       boolean legacySequentialMode = kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.SEQ);
 
       passId = PASS_ID_PREPARING_EXECUTION;
-      _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE);
+      _settings.profile.onEvent(device, ProfilingEvent.PREPARE_EXECUTE);
 
       try {
          if (device == JavaDevice.ALTERNATIVE_ALGORITHM) {
@@ -376,7 +376,7 @@ public class KernelRunner extends KernelRunnerJNI{
                }
             } else {
                boolean silently = true; // not having an alternative algorithm is the normal state, and does not need reporting
-               fallBackToNextDevice(_settings, (Exception) null, silently);
+               fallBackToNextDevice(device, _settings, (Exception) null, silently);
             }
          } else {
             final int localSize0 = _settings.range.getLocalSize(0);
@@ -1214,7 +1214,7 @@ public class KernelRunner extends KernelRunnerJNI{
    }
 
    @SuppressWarnings("deprecation")
-   private Kernel executeOpenCL(ExecutionSettings _settings) throws AparapiException {
+   private Kernel executeOpenCL(Device device, ExecutionSettings _settings) throws AparapiException {
 
       // Read the array refs after kernel may have changed them
       // We need to do this as input to computing the localSize
@@ -1228,7 +1228,7 @@ public class KernelRunner extends KernelRunnerJNI{
       int returnValue = runKernelJNI(jniContextHandle, _settings.range, needSync, _settings.passes, inBufferRemote, outBufferRemote);
       if (returnValue != 0) {
          String reason = "OpenCL execution seems to have failed (runKernelJNI returned " + returnValue + ")";
-         return fallBackToNextDevice(_settings, new AparapiException(reason));
+         return fallBackToNextDevice(device, _settings, new AparapiException(reason));
       }
 
       if (usesOopConversion == true) {
@@ -1282,19 +1282,19 @@ public class KernelRunner extends KernelRunnerJNI{
       }
    }
 
-   private Kernel fallBackToNextDevice(ExecutionSettings _settings, String _reason) {
-      return fallBackToNextDevice(_settings, new AparapiException(_reason));
+   private Kernel fallBackToNextDevice(Device device, ExecutionSettings _settings, String _reason) {
+      return fallBackToNextDevice(device, _settings, new AparapiException(_reason));
    }
 
    @SuppressWarnings("deprecation")
-   synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception) {
-      return fallBackToNextDevice(_settings, _exception, false);
+   synchronized private Kernel fallBackToNextDevice(Device device, ExecutionSettings _settings, Exception _exception) {
+      return fallBackToNextDevice(device, _settings, _exception, false);
    }
 
    @SuppressWarnings("deprecation")
-   synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception, boolean _silently) {
+   synchronized private Kernel fallBackToNextDevice(Device device, ExecutionSettings _settings, Exception _exception, boolean _silently) {
       isFallBack = true;
-      _settings.profile.onEvent(ProfilingEvent.EXECUTED);
+      _settings.profile.onEvent(device, ProfilingEvent.EXECUTED);
       if (_settings.legacyExecutionMode) {
          if (!_silently && logger.isLoggable(Level.WARNING)) {
             logger.warning("Execution mode " + kernel.getExecutionMode() + " failed for " + kernel + ": " + _exception.getMessage());
@@ -1307,6 +1307,8 @@ public class KernelRunner extends KernelRunnerJNI{
             logger.warning("Device failed for " + kernel + ": " + _exception.getMessage());
          }
 
+         //This method is synchronized thus ensuring thread safety on concurrent executions of the same kernel class,
+         //since preferences is shared between such threads.
          preferences.markPreferredDeviceFailed();
 
 //         Device nextDevice = preferences.getPreferredDevice(kernel);
@@ -1337,11 +1339,7 @@ public class KernelRunner extends KernelRunnerJNI{
          boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO;
 
          ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode);
-         // Two Kernels of the same class share the same KernelPreferences object, and since failure (fallback) generally mutates
-         // the preferences object, we must lock it. Note this prevents two Kernels of the same class executing simultaneously.
-         synchronized (preferences) {
-            return executeInternalOuter(settings);
-         }
+         return executeInternalOuter(settings);
       } finally {
          executing = false;
          clearCancelMultiPass();
@@ -1368,7 +1366,7 @@ public class KernelRunner extends KernelRunnerJNI{
       EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode();
 
       if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) {
-         fallBackToNextDevice(_settings, "OpenCL EXECUTION_MODE was requested but Device supplied was not an OpenCLDevice");
+         fallBackToNextDevice(_settings.range.getDevice(), _settings, "OpenCL EXECUTION_MODE was requested but Device supplied was not an OpenCLDevice");
       }
 
       Device device = _settings.range.getDevice();
@@ -1407,20 +1405,20 @@ public class KernelRunner extends KernelRunnerJNI{
                openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestGPU();
                jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
                if (openCLDevice == null) {
-                  return fallBackToNextDevice(_settings, "GPU request can't be honored, no GPU device");
+                  return fallBackToNextDevice(null, _settings, "GPU request can't be honored, no GPU device");
                }
             } else if (requestedExecutionMode.equals(EXECUTION_MODE.ACC)) {
                // Get the best ACC
                openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestACC();
                jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
                if (openCLDevice == null) {
-                  return fallBackToNextDevice(_settings, "ACC request can't be honored, no ACC device");
+                  return fallBackToNextDevice(null, _settings, "ACC request can't be honored, no ACC device");
                }
             } else {
                // We fetch the first CPU device
                openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.firstDevice(Device.TYPE.CPU);
                if (openCLDevice == null) {
-                  return fallBackToNextDevice(_settings, "CPU request can't be honored, no CPU device");
+                  return fallBackToNextDevice(null, _settings, "CPU request can't be honored, no CPU device");
                }
             }
          } else {
@@ -1443,10 +1441,10 @@ public class KernelRunner extends KernelRunnerJNI{
                   try {
                      final ClassModel classModel = ClassModel.createClassModel(kernel.getClass());
                      entryPoint = classModel.getEntrypoint(_settings.entrypoint, kernel);
-                     _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT);
+                     _settings.profile.onEvent(device, ProfilingEvent.CLASS_MODEL_BUILT);
                   } catch (final Exception exception) {
-                     _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT);
-                     return fallBackToNextDevice(_settings, exception);
+                     _settings.profile.onEvent(device, ProfilingEvent.CLASS_MODEL_BUILT);
+                     return fallBackToNextDevice(device, _settings, exception);
                   }
                }
 
@@ -1461,11 +1459,11 @@ public class KernelRunner extends KernelRunnerJNI{
                      // Init the device to check capabilities before emitting the
                      // code that requires the capabilities.
                      jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here
-                     _settings.profile.onEvent(ProfilingEvent.INIT_JNI);
+                     _settings.profile.onEvent(device, ProfilingEvent.INIT_JNI);
                   } // end of synchronized! issue 68
 
                   if (jniContextHandle == 0) {
-                     return fallBackToNextDevice(_settings, "initJNI failed to return a valid handle");
+                     return fallBackToNextDevice(device, _settings, "initJNI failed to return a valid handle");
                   }
 
                   final String extensions = getExtensionsJNI(jniContextHandle);
@@ -1481,11 +1479,11 @@ public class KernelRunner extends KernelRunnerJNI{
                   }
 
                   if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
-                     return fallBackToNextDevice(_settings, "FP64 required but not supported");
+                     return fallBackToNextDevice(device, _settings, "FP64 required but not supported");
                   }
 
                   if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
-                     return fallBackToNextDevice(_settings, "Byte addressable stores required but not supported");
+                     return fallBackToNextDevice(device, _settings, "Byte addressable stores required but not supported");
                   }
 
                   final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport()
@@ -1494,7 +1492,7 @@ public class KernelRunner extends KernelRunnerJNI{
 
                   if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {
 
-                     return fallBackToNextDevice(_settings, "32 bit Atomics required but not supported");
+                     return fallBackToNextDevice(device, _settings, "32 bit Atomics required but not supported");
                   }
 
                   String openCL;
@@ -1509,20 +1507,20 @@ public class KernelRunner extends KernelRunnerJNI{
                            else if (Config.enableShowGeneratedOpenCL) {
                               System.out.println(openCL);
                            }
-                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
+                           _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED);
                            openCLCache.put(kernel.getClass(), openCL);
                         }
                         catch (final CodeGenException codeGenException) {
                            openCLCache.put(kernel.getClass(), CODE_GEN_ERROR_MARKER);
-                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
-                           return fallBackToNextDevice(_settings, codeGenException);
+                           _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED);
+                           return fallBackToNextDevice(device, _settings, codeGenException);
                         }
                      }
                      else {
                         if (openCL.equals(CODE_GEN_ERROR_MARKER)) {
-                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
+                           _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED);
                            boolean silently = true; // since we must have already reported the CodeGenException
-                           return fallBackToNextDevice(_settings, null, silently);
+                           return fallBackToNextDevice(device, _settings, null, silently);
                         }
                      }
                   }
@@ -1547,9 +1545,9 @@ public class KernelRunner extends KernelRunnerJNI{
                         }
                      }
                   }
-                  _settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED);
+                  _settings.profile.onEvent(device, ProfilingEvent.OPENCL_COMPILED);
                   if (handle == 0) {
-                     return fallBackToNextDevice(_settings, "OpenCL compile failed");
+                     return fallBackToNextDevice(device, _settings, "OpenCL compile failed");
                   }
                   
                   args = new KernelArg[entryPoint.getReferencedFields().size()];
@@ -1600,7 +1598,7 @@ public class KernelRunner extends KernelRunnerJNI{
                               try {
                                  setMultiArrayType(args[i], type);
                               } catch (AparapiException e) {
-                                 return fallBackToNextDevice(_settings, "failed to set kernel arguement "
+                                 return fallBackToNextDevice(device, _settings, "failed to set kernel arguement "
                                        + args[i].getName() + ".  Aparapi only supports 2D and 3D arrays.");
                               }
                            } else {
@@ -1677,27 +1675,27 @@ public class KernelRunner extends KernelRunnerJNI{
                   argc = i;
 
                   setArgsJNI(jniContextHandle, args, argc);
-                  _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE);
+                  _settings.profile.onEvent(device, ProfilingEvent.PREPARE_EXECUTE);
                   try {
-                     executeOpenCL(_settings);
+                     executeOpenCL(device, _settings);
                      isFallBack = false;
                   } catch (final AparapiException e) {
-                     fallBackToNextDevice(_settings, e);
+                     fallBackToNextDevice(device, _settings, e);
                   }
                } else { // (entryPoint != null) && !entryPoint.shouldFallback()
-                  fallBackToNextDevice(_settings, "failed to locate entrypoint");
+                  fallBackToNextDevice(device, _settings, "failed to locate entrypoint");
                }
             } else { // (entryPoint == null) || (isFallBack)
                try {
-                  executeOpenCL(_settings);
+                  executeOpenCL(device, _settings);
                   isFallBack = false;
                } catch (final AparapiException e) {
-                  fallBackToNextDevice(_settings, e);
+                  fallBackToNextDevice(device, _settings, e);
                }
             }
          } else { // isOpenCL
             if (!(device instanceof JavaDevice)) {
-               fallBackToNextDevice(_settings, "Non-OpenCL Kernel.EXECUTION_MODE requested but device is not a JavaDevice ");
+               fallBackToNextDevice(device, _settings, "Non-OpenCL Kernel.EXECUTION_MODE requested but device is not a JavaDevice ");
             }
             executeJava(_settings, (JavaDevice) device);
          }
@@ -1709,7 +1707,7 @@ public class KernelRunner extends KernelRunnerJNI{
          return kernel;
       }
       finally {
-         _settings.profile.onEvent(ProfilingEvent.EXECUTED);
+         _settings.profile.onEvent(device, ProfilingEvent.EXECUTED);
          maybeReportProfile(_settings);
       }
    }