OpenCL Dispatch Module for TensorFieldView
The `each` macro transforms `for` loops over a TensorFieldView into OpenCL kernels. It handles:
- AoSoA memory layout for coalesced GPU access
- Matrix/vector operations (matmul, matadd, matvec, scalar ops)
- Expression trees with operation type inference
- Multi-device dispatch
- Automatic type detection and OpenCL code generation
Types
ElementType = enum etFloat32, etFloat64, etInt32, etInt64
- Element type for kernel code generation
Consts
DebugKernels {.booldefine.} = false
UseWorkGroups {.booldefine.} = false
VectorWidth {.intdefine.} = 8
Exports
-
MEM_READ_ONLY, enqueueReadBuffer, buildProgram, initCL, COMMAND_FILL_IMAGE, DEVICE_PARTITION_MAX_SUB_DEVICES, buildErrors, PROGRAM_BINARY_TYPE, COMMAND_MARKER, COMMAND_COPY_BUFFER, createCommandQueue, DEVICE_IMAGE_SUPPORT, DEVICE_VENDOR_ID, KERNEL_ARG_ADDRESS_CONSTANT, Tdevice_local_mem_type, DEVICE_BUILT_IN_KERNELS, createProgramWithBuiltInKernels, KERNEL_ARG_ADDRESS_GLOBAL, Tkernel_arg_address_qualifier, createSubDevices, enqueueFillImage, READ_WRITE_CACHE, NONE, PROGRAM_BUILD_OPTIONS, DEVICE_OPENCL_C_VERSION, DEVICE_PREFERRED_VECTOR_WIDTH_INT, releaseSampler, release, fmod, KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, name, QUEUE_REFERENCE_COUNT, PROGRAM_BINARY_TYPE_LIBRARY, FILTER_NEAREST, GLOBAL, native_log10, DEVICE_ERROR_CORRECTION_SUPPORT, write_mem_fence_impl, KERNEL_ARG_TYPE_RESTRICT, PROGRAM_DEVICES, createContextFromType, fabs, EVENT_COMMAND_QUEUE, log10, INTENSITY, EVENT_COMMAND_TYPE, DEVICE_GLOBAL_MEM_CACHE_SIZE, read, CL_UNORM_INT8, Tmem_object_type, PLATFORM_VENDOR, DEVICE_IMAGE_MAX_BUFFER_SIZE, COMMAND_ACQUIRE_GL_OBJECTS, atomic_inc_impl, exp, DEVICE_PARTITION_BY_COUNTS, createProgram, EVENT_COMMAND_EXECUTION_STATUS, FP_DENORM, CL_TRUE, KERNEL_ARG_ADDRESS_LOCAL, Tdevice_exec_capabilities, VERSION_1_0, KERNEL_ARG_TYPE_NONE, KERNEL_PRIVATE_MEM_SIZE, CL_SIGNED_INT32, setArg, PLATFORM_NAME, COMPLETE, MEM_HOST_PTR, DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, sinh, mem_fence, DEVICE_QUEUE_PROPERTIES, enqueueMigrateMemObjects, Tbuffer_region, MEM_READ_WRITE, DEVICE_AVAILABLE, DEVICE_SINGLE_FP_CONFIG, Tmem_info, MEM_OBJECT_IMAGE1D, get_local_size, KERNEL_NUM_ARGS, cosh, KERNEL_CONTEXT, PROGRAM_BINARY_TYPE_EXECUTABLE, DEVICE_MAX_COMPUTE_UNITS, native_sin_impl, DEVICE_TYPE_DEFAULT, PROGRAM_NUM_DEVICES, DEVICE_PARTITION_TYPE, getEventInfo, createBuffer, device, barrier_impl, CL_UNORM_INT16, atomic_or_impl, enqueueTask, native_tan, native_exp_impl, enqueueReadBufferRect, IMAGE_NUM_SAMPLES, release, enqueueBarrierWithWaitList, log, 
getExtensionFunctionAddressForPlatform, retainKernel, IMAGE_SLICE_PITCH, getCommandQueueInfo, retainMemObject, MEM_OBJECT_IMAGE1D_BUFFER, atomic_sub, getKernelArgInfo, TCreateContextCb, sin, native_sqrt_impl, MEM_HOST_NO_ACCESS, DEVICE_IMAGE2D_MAX_WIDTH, atan2, QUEUE_PROPERTIES, DEPTH, PROGRAM_KERNEL_NAMES, createUserEvent, run3d, DEVICE_AFFINITY_DOMAIN_L2_CACHE, CONTEXT_DEVICES, CONTEXT_INTEROP_USER_SYNC, getKernelInfo, CONTEXT_PROPERTIES, Pprogram, setKernelArg, native_exp2_impl, SAMPLER_ADDRESSING_MODE, CL_A, DEVICE_PROFILE, native_log_impl, DEVICE_MAX_CLOCK_FREQUENCY, enqueueWriteBufferRect, MEM_HOST_WRITE_ONLY, CL_SIGNED_INT16, CL_UNSIGNED_INT8, native_sqrt, COMMAND_UNMAP_MEM_OBJECT, clamp, [], Pplatform_id, Tplatform_info, enqueueReadImage, SAMPLER_NORMALIZED_COORDS, Taddressing_mode, args, DEVICE_MAX_CONSTANT_ARGS, pow, MAP_READ, [], native_sin, PROGRAM_BINARY_TYPE_COMPILED_OBJECT, MEM_HOST_READ_ONLY, EVENT_CONTEXT, KERNEL_GLOBAL_WORK_SIZE, write_mem_fence, compileProgram, COMMAND_MIGRATE_MEM_OBJECTS, COMMAND_WRITE_BUFFER, createContext, KERNEL_ARG_ADDRESS_QUALIFIER, Timage_format, name, SAMPLER_REFERENCE_COUNT, Tcommand_type, DEVICE_IMAGE3D_MAX_HEIGHT, DEVICE_IMAGE3D_MAX_DEPTH, singleDeviceDefaults, unloadCompiler, PROGRAM_NUM_KERNELS, atomic_or, atomic_min, gpuBufferLike, Tdevice_mem_cache_type, openclDefaults, native_powr_impl, native_tan_impl, get_global_id, DEVICE_MAX_WORK_ITEM_DIMENSIONS, COMMAND_RELEASE_GL_OBJECTS, Pevent, COMMAND_MAP_BUFFER, getProgramBuildInfo, DEVICE_TYPE_CUSTOM, enqueueCopyBufferRect, createAndBuild, setArg, retainProgram, DEVICE_PLATFORM, ADDRESS_MIRRORED_REPEAT, releaseKernel, unloadPlatformCompiler, createAndBuildBinary, native_cos_impl, get_num_groups, Tkernel_arg_access_qualifier, CLK_GLOBAL_MEM_FENCE, IMAGE_ELEMENT_SIZE, waitForEvents, native_exp2, MEM_OBJECT_IMAGE2D, enqueueUnmapMemObject, DEVICE_REFERENCE_COUNT, DEVICE_MEM_BASE_ADDR_ALIGN, enqueueCopyBuffer, MEM_ALLOC_HOST_PTR, CL_RA, getDevices, CL_UNORM_INT24, 
Timage_info, getEventProfilingInfo, CL_RG, DEVICE_IMAGE_PITCH_ALIGNMENT, atomic_max, PLATFORM_EXTENSIONS, retainDevice, Tprogram_build_info, rsqrt, FP_ROUND_TO_INF, ADDRESS_CLAMP, PROGRAM_CONTEXT, finish, CL_SNORM_INT8, firstPlatform, createProgramWithBinary, MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED, DEPTH_STENCIL, Tchannel_type, DEVICE_AFFINITY_DOMAIN_L3_CACHE, KERNEL_REFERENCE_COUNT, Dim, DEVICE_MAX_WORK_ITEM_SIZES, atomic_dec_impl, CL_RGx, releaseContext, DEVICE_MAX_CONSTANT_BUFFER_SIZE, atomic_xor, get_global_size, DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, get_group_id, CL_R, maxWorkItems, DEVICE_PARENT_DEVICE, ADDRESS_REPEAT, getPlatformInfo, multipleDeviceDefaults, DEVICE_EXTENSIONS, Tcommand_queue_info, DEVICE_MAX_READ_IMAGE_ARGS, QUEUE_DEVICE, enqueueMapBuffer, DEVICE_EXECUTION_CAPABILITIES, DEVICE_TYPE_CPU, DEVICE_NATIVE_VECTOR_WIDTH_INT, CL_UNSIGNED_INT16, DEVICE_PARTITION_BY_AFFINITY_DOMAIN, TDeviceType, QUEUE_PROFILING_ENABLE, createProgramWithSource, EXEC_KERNEL, DEVICE_VERSION, enqueueWriteBuffer, enqueueNDRangeKernel, DEVICE_GLOBAL_MEM_CACHE_TYPE, DEVICE_ENDIAN_LITTLE, setArg, FP_ROUND_TO_ZERO, round, ADDRESS_NONE, PROGRAM_BUILD_STATUS, trunc, COMMAND_TASK, COMMAND_READ_BUFFER_RECT, TProgramCb, CL_UNORM_INT_101010, COMMAND_NDRANGE_KERNEL, ADDRESS_CLAMP_TO_EDGE, get_local_id, CL_UNORM_SHORT_555, KERNEL_COMPILE_WORK_GROUP_SIZE, createKernelsInProgram, getSupportedImageFormats, get_work_dim, COMMAND_READ_BUFFER, setEventCallback, DEVICE_NATIVE_VECTOR_WIDTH_SHORT, DEVICE_PARTITION_BY_COUNTS_LIST_END, DEVICE_PREFERRED_VECTOR_WIDTH_LONG, Tkernel_work_group_info, DEVICE_AFFINITY_DOMAIN_L4_CACHE, Tcontext_properties, ceil, read_mem_fence_impl, COMMAND_COPY_BUFFER_TO_IMAGE, COMMAND_BARRIER, releaseEvent, Tcontext_info, buildOn, buffer, native_recip_impl, get_global_offset, finalizeCL, Tfilter_mode, DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, native_rsqrt, releaseProgram, MEM_USE_HOST_PTR, KERNEL_ARG_TYPE_VOLATILE, releaseDevice, MEM_CONTEXT, Tkernel_arg_type_qualifier, 
PROGRAM_REFERENCE_COUNT, IMAGE_HEIGHT, MAP_WRITE, global, DEVICE_IMAGE2D_MAX_HEIGHT, atomic_inc, DEVICE_NAME, Tprogram_binary_type, IMAGE_FORMAT, run2d, write, TMemObjectDestructorCb, BLOCKING, NON_BLOCKING, KERNEL_ARG_ACCESS_READ_ONLY, createAndBuild, Tmem_flags, DEVICE_IMAGE3D_MAX_WIDTH, release, Tbuild_status, setArg, native_log2_impl, createKernel, retainEvent, enqueueWriteImage, IMAGE_WIDTH, PLATFORM_PROFILE, localMemory, TClResult, DEVICE_VENDOR, Tkernel_info, tan, check, CL_BGRA, fmin, FP_ROUND_TO_NEAREST, getContextInfo, PLATFORM_VERSION, CL_SIGNED_INT8, DEVICE_MAX_MEM_ALLOC_SIZE, CL_RGBx, retainCommandQueue, KERNEL_ARG_ADDRESS_PRIVATE, MEM_SIZE, MEM_WRITE_ONLY, CL_HALF_FLOAT, RUNNING, BUILD_SUCCESS, releaseMemObject, getProgramInfo, gpuBuffer, MEM_MAP_COUNT, Timage_desc, DEVICE_LOCAL_MEM_SIZE, KERNEL_ARG_ACCESS_WRITE_ONLY, IMAGE_ARRAY_SIZE, setUserEventStatus, CONTEXT_PLATFORM, KERNEL_ARG_TYPE_QUALIFIER, bufferLike, KERNEL_WORK_GROUP_SIZE, tanh, PROGRAM_BUILD_LOG, log2, linkProgram, acos, DEVICE_GLOBAL_MEM_SIZE, atomic_dec, Tchannel_order, atomic_max_impl, DEVICE_ADDRESS_BITS, Tdevice_partition_property, read_mem_fence, Tcommand_queue_properties, createSampler, createContext, IMAGE_NUM_MIP_LEVELS, raiseEOpenCL, enqueueCopyBufferToImage, atomic_cmpxchg, globalMemory, flush, DEVICE_TYPE_GPU, DEVICE_AFFINITY_DOMAIN_L1_CACHE, KERNEL_ARG_ACCESS_READ_WRITE, QUEUED, DEVICE_NATIVE_VECTOR_WIDTH_LONG, Pkernel, COMMAND_WRITE_BUFFER_RECT, createImage2D, FP_SOFT_FLOAT, Tprogram_info, VERSION_1_2, MEM_OBJECT_BUFFER, DEVICE_PARTITION_PROPERTIES, MEM_ASSOCIATED_MEMOBJECT, EVENT_REFERENCE_COUNT, createImage, LocalBuffer, CL_FLOAT, Tsampler_info, Tmem_migration_flags, atan, DEVICE_NATIVE_VECTOR_WIDTH_HALF, enqueueBarrier, PROGRAM_BINARY_SIZES, DEVICE_MAX_WRITE_IMAGE_ARGS, DEVICE_AFFINITY_DOMAIN_NUMA, Tbool, createKernel, PROGRAM_BINARIES, DEVICE_COMPILER_AVAILABLE, asin, Tprofiling_info, CL_UNORM_SHORT_565, Tkernel_arg_info, createSubBuffer, IMAGE_DEPTH, CL_UNSIGNED_INT32, 
atomic_and, atomic_sub_impl, SAMPLER_FILTER_MODE, IMAGE_ROW_PITCH, COMMAND_FILL_BUFFER, read, QUEUE_CONTEXT, COMMAND_COPY_BUFFER_RECT, rsqrt_impl, clamp_impl, Psampler, KERNEL_ARG_TYPE_CONST, run2d, DEVICE_PARTITION_AFFINITY_DOMAIN, read, DEVICE_PROFILING_TIMER_RESOLUTION, sqrt, Tdevice_info, MAP_WRITE_INVALIDATE_REGION, atomic_add, write, native_recip, getPlatformByName, EOpenCL, VERSION_1_1, DEVICE_PREFERRED_VECTOR_WIDTH_HALF, DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, mem_fence_impl, run, QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, CONTEXT_REFERENCE_COUNT, FILTER_LINEAR, Tmap_flags, constant, MEM_COPY_HOST_PTR, DEVICE_IMAGE_MAX_ARRAY_SIZE, createImage3D, enqueueMarkerWithWaitList, retainContext, CL_RGBA, DEVICE_GLOBAL_MEM_CACHELINE_SIZE, COMMAND_READ_IMAGE, setMemObjectDestructorCallback, getPlatformIDs, kernel, floor, LUMINANCE, Tbitfield, BUILD_ERROR, MEM_OBJECT_IMAGE3D, COMMAND_WRITE_IMAGE, getDeviceInfo, DEVICE_TYPE_ACCELERATOR, DEVICE_NATIVE_VECTOR_WIDTH_CHAR, DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, atomic_xchg, release, atomic_and_impl, releaseCommandQueue, enqueueMapImage, DEVICE_LINKER_AVAILABLE, enqueueWaitForEvents, native_log, DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, COMMAND_COPY_IMAGE_TO_BUFFER, version, run, DEVICE_TYPE_ALL, DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, KERNEL_FUNCTION_NAME, commandQueueFor, TEventCb, COMMAND_USER, getDeviceIDs, Pcommand_queue, atomic_xor_impl, BUILD_IN_PROGRESS, createProgramBinary, CL_Rx, KERNEL_ARG_ACCESS_QUALIFIER, DEVICE_LOCAL_MEM_TYPE, native_log10_impl, EXEC_NATIVE_KERNEL, enqueueNativeKernel, KERNEL_ARG_ACCESS_NONE, enqueueCopyImageToBuffer, maxWorkGroups, []=, setArg, Pdevice_id, GpuBuffer, atomic_xchg_impl, []=, atomic_cmpxchg_impl, enqueueMarker, native_powr, MEM_OFFSET, DEVICE_PRINTF_BUFFER_SIZE, buildOn, DEVICE_MAX_PARAMETER_SIZE, retainSampler, write, CL_ARGB, CLK_LOCAL_MEM_FENCE, COMMAND_COPY_IMAGE, getExtensionFunctionAddress, fma, PROGRAM_BINARY_TYPE_NONE, MEM_REFERENCE_COUNT, fmax, DEVICE_PREFERRED_INTEROP_USER_SYNC, FP_INF_NAN, 
MEM_OBJECT_IMAGE2D_ARRAY, KERNEL_ATTRIBUTES, DEVICE_HOST_UNIFIED_MEMORY, release, Pmem, FP_FMA, CL_FALSE, MEM_TYPE, native_log2, barrier, FP_CORRECTLY_ROUNDED_DIVIDE_SQRT, COMMAND_NATIVE_KERNEL, DEVICE_MAX_WORK_GROUP_SIZE, DEVICE_TYPE, atomic_add_impl, MEM_FLAGS, oclName, exp2, native_rsqrt_impl, getMemObjectInfo, DRIVER_VERSION, native_exp, CL_SNORM_INT16, Tdevice_affinity_domain, Tevent_info, BUFFER_CREATE_TYPE_REGION, enqueueFillBuffer, Tbuffer_create_type, IMAGE_BUFFER, KERNEL_PROGRAM, DEVICE_MAX_SAMPLERS, atomic_min_impl, READ_ONLY_CACHE, TUserCb, COMMAND_MAP_IMAGE, CONTEXT_NUM_DEVICES, DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, Tdevice_fp_config, SAMPLER_CONTEXT, DEVICE_PARTITION_EQUALLY, getSamplerInfo, KERNEL_ARG_NAME, DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cos, enqueueCopyImage, PROGRAM_SOURCE, KERNEL_LOCAL_MEM_SIZE, release, SUBMITTED, DEVICE_DOUBLE_FP_CONFIG, MEM_OBJECT_IMAGE1D_ARRAY, run3d, BUILD_NONE, getImageInfo, local, native_cos, CL_RGB, getKernelWorkGroupInfo, Pcontext, KERNEL_ARG_TYPE_NAME, LOCAL, MIGRATE_MEM_OBJECT_HOST