diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java
index 7bccfd2b4776d92302adcd381e2795c8de4d0ce0..0fb8e8d771e569a0868de5d98ec92d2a94e245fc 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java
@@ -1356,255 +1356,263 @@ class KernelRunner{
    synchronized Kernel execute(String _entrypointName, final Range _range, final int _passes) {
 
       long executeStartTime = System.currentTimeMillis();
+      
       if (_range == null) {
          throw new IllegalStateException("range can't be null");
       }
-
-      /* for backward compatibility reasons we still honor execution mode, but only if the Range *does not* contain a device specification */
-      Device device = _range.getDevice();
-      if ((kernel.getExecutionMode().isOpenCL() && device == null) || (device instanceof OpenCLDevice)) {
+      
+      /* for backward compatibility reasons we still honor execution mode */
+      if (kernel.getExecutionMode().isOpenCL()) {
          // System.out.println("OpenCL");
 
-         if (entryPoint == null) {
-            try {
-               ClassModel classModel = new ClassModel(kernel.getClass());
-               entryPoint = classModel.getEntrypoint(_entrypointName, kernel);
-            } catch (Exception exception) {
-
-               return warnFallBackAndExecute(_entrypointName, _range, _passes, exception);
-            }
-            if ((entryPoint != null) && !entryPoint.shouldFallback()) {
-               synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68
-                  if (device != null && !(device instanceof OpenCLDevice)) {
-                     throw new IllegalStateException("range's device is not suitable for OpenCL ");
-                  }
-                  OpenCLDevice openCLDevice = (OpenCLDevice) device; // still might be null! 
-
-                  int jniFlags = 0;
-                  if (openCLDevice == null) {
-                     if (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU)) {
-                        // We used to treat as before by getting first GPU device
-                        // now we get the best GPU
-                        openCLDevice = (OpenCLDevice) OpenCLDevice.best();
-                        jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. 
-                     } else {
-                        // We fetch the first CPU device 
-                        openCLDevice = (OpenCLDevice) OpenCLDevice.firstCPU();
-                        if (openCLDevice == null) {
-                           return warnFallBackAndExecute(_entrypointName, _range, _passes,
-                                 "CPU request can't be honored not CPU device");
-                        }
-                     }
-                  } else {
-                     if (openCLDevice.getType() == Device.TYPE.GPU) {
-                        jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. 
-                     }
-                  }
-
-                  //  jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0);
-                  //  jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0);
-                  //  jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0);
-                  // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0);
-                  // jniFlags |= (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
-                  // Init the device to check capabilities before emitting the
-                  // code that requires the capabilities.
-
-                  // synchronized(Kernel.class){
-                  jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here
-               } // end of synchronized! issue 68
-               if (jniContextHandle == 0) {
-                  return warnFallBackAndExecute(_entrypointName, _range, _passes, "initJNI failed to return a valid handle");
-               }
-
-               String extensions = getExtensionsJNI(jniContextHandle);
-               capabilitiesSet = new HashSet<String>();
-               StringTokenizer strTok = new StringTokenizer(extensions);
-               while (strTok.hasMoreTokens()) {
-                  capabilitiesSet.add(strTok.nextToken());
-               }
-               if (logger.isLoggable(Level.FINE)) {
-                  logger.fine("Capabilities initialized to :" + capabilitiesSet.toString());
-               }
-
-               if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
-
-                  return warnFallBackAndExecute(_entrypointName, _range, _passes, "FP64 required but not supported");
-               }
-
-               if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
-
-                  return warnFallBackAndExecute(_entrypointName, _range, _passes,
-                        "Byte addressable stores required but not supported");
-               }
-
-               boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport() && hasGlobalInt32ExtendedAtomicsSupport()
-                     && hasLocalInt32BaseAtomicsSupport() && hasLocalInt32ExtendedAtomicsSupport();
-
-               if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {
-
-                  return warnFallBackAndExecute(_entrypointName, _range, _passes, "32 bit Atomics required but not supported");
-               }
-
-               String openCL = null;
+         // See if user supplied a Device
+         Device device = _range.getDevice();
+            
+         if ((device == null) || (device instanceof OpenCLDevice)) {
+            if (entryPoint == null) {
                try {
-                  openCL = KernelWriter.writeToString(entryPoint);
-               } catch (CodeGenException codeGenException) {
-                  return warnFallBackAndExecute(_entrypointName, _range, _passes, codeGenException);
-               }
-
-               if (Config.enableShowGeneratedOpenCL) {
-                  System.out.println(openCL);
+                  ClassModel classModel = new ClassModel(kernel.getClass());
+                  entryPoint = classModel.getEntrypoint(_entrypointName, kernel);
+               } catch (Exception exception) {
+                  return warnFallBackAndExecute(_entrypointName, _range, _passes, exception);
                }
-               if (logger.isLoggable(Level.INFO)) {
-                  logger.info(openCL);
-               }
-
-               // Send the string to OpenCL to compile it
-               if (buildProgramJNI(jniContextHandle, openCL) == 0) {
-                  return warnFallBackAndExecute(_entrypointName, _range, _passes, "OpenCL compile failed");
-               }
-
-               args = new KernelArg[entryPoint.getReferencedFields().size()];
-               int i = 0;
-
-               for (Field field : entryPoint.getReferencedFields()) {
-                  try {
-                     field.setAccessible(true);
-                     args[i] = new KernelArg();
-                     args[i].name = field.getName();
-                     args[i].field = field;
-                     if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) {
-                        args[i].type |= ARG_STATIC;
+               
+               if ((entryPoint != null) && !entryPoint.shouldFallback()) {
+                  synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68
+                     if (device != null && !(device instanceof OpenCLDevice)) {
+                        throw new IllegalStateException("range's device is not suitable for OpenCL ");
                      }
-
-                     Class<?> type = field.getType();
-                     if (type.isArray()) {
-
-                        if (field.getAnnotation(com.amd.aparapi.Kernel.Local.class) != null
-                              || args[i].name.endsWith(Kernel.LOCAL_SUFFIX)) {
-                           args[i].type |= ARG_LOCAL;
-                        } else if (field.getAnnotation(com.amd.aparapi.Kernel.Constant.class) != null
-                              || args[i].name.endsWith(Kernel.CONSTANT_SUFFIX)) {
-                           args[i].type |= ARG_CONSTANT;
+                     
+                     OpenCLDevice openCLDevice = (OpenCLDevice) device; // still might be null! 
+   
+                     int jniFlags = 0;
+                     if (openCLDevice == null) {
+                        if (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU)) {
+                           // We used to treat as before by getting first GPU device
+                           // now we get the best GPU
+                           openCLDevice = (OpenCLDevice) OpenCLDevice.best();
+                           jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. 
                         } else {
-                           args[i].type |= ARG_GLOBAL;
+                           // We fetch the first CPU device 
+                           openCLDevice = (OpenCLDevice) OpenCLDevice.firstCPU();
+                           if (openCLDevice == null) {
+                              return warnFallBackAndExecute(_entrypointName, _range, _passes,
+                                    "CPU request can't be honored not CPU device");
+                           }
                         }
-                        args[i].array = null; // will get updated in updateKernelArrayRefs
-                        args[i].type |= ARG_ARRAY;
-                        if (isExplicit()) {
-                           args[i].type |= ARG_EXPLICIT;
+                     } else {
+                        if (openCLDevice.getType() == Device.TYPE.GPU) {
+                           jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. 
                         }
-                        // for now, treat all write arrays as read-write, see bugzilla issue 4859
-                        // we might come up with a better solution later
-                        args[i].type |= entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ)
-                              : 0;
-                        args[i].type |= entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0;
-                        // args[i].type |= ARG_GLOBAL;
-                        args[i].type |= type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0;
-
-                        args[i].type |= type.isAssignableFrom(int[].class) ? ARG_INT : 0;
-
-                        args[i].type |= type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0;
-
-                        args[i].type |= type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0;
-
-                        args[i].type |= type.isAssignableFrom(char[].class) ? ARG_CHAR : 0;
-
-                        args[i].type |= type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0;
-
-                        args[i].type |= type.isAssignableFrom(long[].class) ? ARG_LONG : 0;
-
-                        args[i].type |= type.isAssignableFrom(short[].class) ? ARG_SHORT : 0;
-
-                        // arrays whose length is used will have an int arg holding
-                        // the length as a kernel param
-                        if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].name)) {
-                           args[i].type |= ARG_ARRAYLENGTH;
+                     }
+   
+                     //  jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0);
+                     //  jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0);
+                     //  jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0);
+                     // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0);
+                     // jniFlags |= (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
+                     // Init the device to check capabilities before emitting the
+                     // code that requires the capabilities.
+   
+                     // synchronized(Kernel.class){
+                     jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here
+                  } // end of synchronized! issue 68
+                  
+                  if (jniContextHandle == 0) {
+                     return warnFallBackAndExecute(_entrypointName, _range, _passes, "initJNI failed to return a valid handle");
+                  }
+   
+                  String extensions = getExtensionsJNI(jniContextHandle);
+                  capabilitiesSet = new HashSet<String>();
+                  
+                  StringTokenizer strTok = new StringTokenizer(extensions);
+                  while (strTok.hasMoreTokens()) {
+                     capabilitiesSet.add(strTok.nextToken());
+                  }
+                  
+                  if (logger.isLoggable(Level.FINE)) {
+                     logger.fine("Capabilities initialized to :" + capabilitiesSet.toString());
+                  }
+   
+                  if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
+                     return warnFallBackAndExecute(_entrypointName, _range, _passes, "FP64 required but not supported");
+                  }
+   
+                  if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
+                     return warnFallBackAndExecute(_entrypointName, _range, _passes,
+                           "Byte addressable stores required but not supported");
+                  }
+   
+                  boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport() && hasGlobalInt32ExtendedAtomicsSupport()
+                        && hasLocalInt32BaseAtomicsSupport() && hasLocalInt32ExtendedAtomicsSupport();
+   
+                  if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {
+   
+                     return warnFallBackAndExecute(_entrypointName, _range, _passes, "32 bit Atomics required but not supported");
+                  }
+   
+                  String openCL = null;
+                  try {
+                     openCL = KernelWriter.writeToString(entryPoint);
+                  } catch (CodeGenException codeGenException) {
+                     return warnFallBackAndExecute(_entrypointName, _range, _passes, codeGenException);
+                  }
+   
+                  if (Config.enableShowGeneratedOpenCL) {
+                     System.out.println(openCL);
+                  }
+                  
+                  if (logger.isLoggable(Level.INFO)) {
+                     logger.info(openCL);
+                  }
+   
+                  // Send the string to OpenCL to compile it
+                  if (buildProgramJNI(jniContextHandle, openCL) == 0) {
+                     return warnFallBackAndExecute(_entrypointName, _range, _passes, "OpenCL compile failed");
+                  }
+   
+                  args = new KernelArg[entryPoint.getReferencedFields().size()];
+                  int i = 0;
+   
+                  for (Field field : entryPoint.getReferencedFields()) {
+                     try {
+                        field.setAccessible(true);
+                        args[i] = new KernelArg();
+                        args[i].name = field.getName();
+                        args[i].field = field;
+                        if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) {
+                           args[i].type |= ARG_STATIC;
                         }
-                        if (type.getName().startsWith("[L")) {
-                           args[i].type |= (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ);
-                           if (logger.isLoggable(Level.FINE)) {
-                              logger.fine("tagging " + args[i].name + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
+   
+                        Class<?> type = field.getType();
+                        if (type.isArray()) {
+                           if (field.getAnnotation(com.amd.aparapi.Kernel.Local.class) != null
+                                 || args[i].name.endsWith(Kernel.LOCAL_SUFFIX)) {
+                              args[i].type |= ARG_LOCAL;
+                           } else if (field.getAnnotation(com.amd.aparapi.Kernel.Constant.class) != null
+                                 || args[i].name.endsWith(Kernel.CONSTANT_SUFFIX)) {
+                              args[i].type |= ARG_CONSTANT;
+                           } else {
+                              args[i].type |= ARG_GLOBAL;
+                           }
+                           
+                           args[i].array = null; // will get updated in updateKernelArrayRefs
+                           args[i].type |= ARG_ARRAY;
+                           
+                           if (isExplicit()) {
+                              args[i].type |= ARG_EXPLICIT;
+                           }
+                           
+                           // for now, treat all write arrays as read-write, see bugzilla issue 4859
+                           // we might come up with a better solution later
+                           args[i].type |= entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ)
+                                 : 0;
+                           args[i].type |= entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0;
+                           // args[i].type |= ARG_GLOBAL;
+                           args[i].type |= type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0;
+   
+                           args[i].type |= type.isAssignableFrom(int[].class) ? ARG_INT : 0;
+   
+                           args[i].type |= type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0;
+   
+                           args[i].type |= type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0;
+   
+                           args[i].type |= type.isAssignableFrom(char[].class) ? ARG_CHAR : 0;
+   
+                           args[i].type |= type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0;
+   
+                           args[i].type |= type.isAssignableFrom(long[].class) ? ARG_LONG : 0;
+   
+                           args[i].type |= type.isAssignableFrom(short[].class) ? ARG_SHORT : 0;
+   
+                           // arrays whose length is used will have an int arg holding
+                           // the length as a kernel param
+                           if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].name)) {
+                              args[i].type |= ARG_ARRAYLENGTH;
                            }
+                           
+                           if (type.getName().startsWith("[L")) {
+                              args[i].type |= (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ);
+                              if (logger.isLoggable(Level.FINE)) {
+                                 logger.fine("tagging " + args[i].name + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
+                              }
+                           }
+                        } else if (type.isAssignableFrom(float.class)) {
+                           args[i].type |= ARG_PRIMITIVE;
+                           args[i].type |= ARG_FLOAT;
+                        } else if (type.isAssignableFrom(int.class)) {
+                           args[i].type |= ARG_PRIMITIVE;
+                           args[i].type |= ARG_INT;
+                        } else if (type.isAssignableFrom(double.class)) {
+                           args[i].type |= ARG_PRIMITIVE;
+                           args[i].type |= ARG_DOUBLE;
+                        } else if (type.isAssignableFrom(long.class)) {
+                           args[i].type |= ARG_PRIMITIVE;
+                           args[i].type |= ARG_LONG;
+                        } else if (type.isAssignableFrom(boolean.class)) {
+                           args[i].type |= ARG_PRIMITIVE;
+                           args[i].type |= ARG_BOOLEAN;
+                        } else if (type.isAssignableFrom(byte.class)) {
+                           args[i].type |= ARG_PRIMITIVE;
+                           args[i].type |= ARG_BYTE;
+                        } else if (type.isAssignableFrom(char.class)) {
+                           args[i].type |= ARG_PRIMITIVE;
+                           args[i].type |= ARG_CHAR;
+                        } else if (type.isAssignableFrom(short.class)) {
+                           args[i].type |= ARG_PRIMITIVE;
+                           args[i].type |= ARG_SHORT;
                         }
-
-                     } else if (type.isAssignableFrom(float.class)) {
-                        args[i].type |= ARG_PRIMITIVE;
-                        args[i].type |= ARG_FLOAT;
-                     } else if (type.isAssignableFrom(int.class)) {
-                        args[i].type |= ARG_PRIMITIVE;
-                        args[i].type |= ARG_INT;
-                     } else if (type.isAssignableFrom(double.class)) {
-                        args[i].type |= ARG_PRIMITIVE;
-                        args[i].type |= ARG_DOUBLE;
-                     } else if (type.isAssignableFrom(long.class)) {
-                        args[i].type |= ARG_PRIMITIVE;
-                        args[i].type |= ARG_LONG;
-                     } else if (type.isAssignableFrom(boolean.class)) {
-                        args[i].type |= ARG_PRIMITIVE;
-                        args[i].type |= ARG_BOOLEAN;
-                     } else if (type.isAssignableFrom(byte.class)) {
-                        args[i].type |= ARG_PRIMITIVE;
-                        args[i].type |= ARG_BYTE;
-                     } else if (type.isAssignableFrom(char.class)) {
-                        args[i].type |= ARG_PRIMITIVE;
-                        args[i].type |= ARG_CHAR;
-                     } else if (type.isAssignableFrom(short.class)) {
-                        args[i].type |= ARG_PRIMITIVE;
-                        args[i].type |= ARG_SHORT;
+                        // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type );
+                     } catch (IllegalArgumentException e) {
+                        e.printStackTrace();
+                     }
+   
+                     args[i].primitiveSize = ((args[i].type & ARG_FLOAT) != 0 ? 4 : (args[i].type & ARG_INT) != 0 ? 4
+                           : (args[i].type & ARG_BYTE) != 0 ? 1 : (args[i].type & ARG_CHAR) != 0 ? 2
+                                 : (args[i].type & ARG_BOOLEAN) != 0 ? 1 : (args[i].type & ARG_SHORT) != 0 ? 2
+                                       : (args[i].type & ARG_LONG) != 0 ? 8 : (args[i].type & ARG_DOUBLE) != 0 ? 8 : 0);
+   
+                     if (logger.isLoggable(Level.FINE)) {
+                        logger.fine("arg " + i + ", " + args[i].name + ", type=" + Integer.toHexString(args[i].type)
+                              + ", primitiveSize=" + args[i].primitiveSize);
                      }
-                     // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type );
-                  } catch (IllegalArgumentException e) {
-                     e.printStackTrace();
+   
+                     i++;
                   }
-
-                  args[i].primitiveSize = ((args[i].type & ARG_FLOAT) != 0 ? 4 : (args[i].type & ARG_INT) != 0 ? 4
-                        : (args[i].type & ARG_BYTE) != 0 ? 1 : (args[i].type & ARG_CHAR) != 0 ? 2
-                              : (args[i].type & ARG_BOOLEAN) != 0 ? 1 : (args[i].type & ARG_SHORT) != 0 ? 2
-                                    : (args[i].type & ARG_LONG) != 0 ? 8 : (args[i].type & ARG_DOUBLE) != 0 ? 8 : 0);
-
-                  if (logger.isLoggable(Level.FINE)) {
-                     logger.fine("arg " + i + ", " + args[i].name + ", type=" + Integer.toHexString(args[i].type)
-                           + ", primitiveSize=" + args[i].primitiveSize);
+   
+                  // at this point, i = the actual used number of arguments
+                  // (private buffers do not get treated as arguments)
+   
+                  argc = i;
+   
+                  setArgsJNI(jniContextHandle, args, argc);
+   
+                  conversionTime = System.currentTimeMillis() - executeStartTime;
+   
+                  try {
+                     executeOpenCL(_entrypointName, _range, _passes);
+                  } catch (AparapiException e) {
+                     warnFallBackAndExecute(_entrypointName, _range, _passes, e);
                   }
-
-                  i++;
-               }
-
-               // at this point, i = the actual used number of arguments
-               // (private buffers do not get treated as arguments)
-
-               argc = i;
-
-               setArgsJNI(jniContextHandle, args, argc);
-
-               conversionTime = System.currentTimeMillis() - executeStartTime;
-
-               try {
-                  executeOpenCL(_entrypointName, _range, _passes);
-               } catch (AparapiException e) {
-                  warnFallBackAndExecute(_entrypointName, _range, _passes, e);
                }
             } else {
                warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to locate entrypoint");
             }
-
          } else {
-
             try {
                executeOpenCL(_entrypointName, _range, _passes);
             } catch (AparapiException e) {
-
                warnFallBackAndExecute(_entrypointName, _range, _passes, e);
             }
          }
-
       } else {
          executeJava(_range, _passes);
       }
+      
       if (Config.enableExecutionModeReporting) {
          System.out.println(kernel.getClass().getCanonicalName() + ":" + kernel.getExecutionMode());
       }
+      
       executionTime = System.currentTimeMillis() - executeStartTime;
       accumulatedExecutionTime += executionTime;