SIMD-Aware OpenMP Dispatch
This module provides vectorized loop iteration for OpenMP backend with SIMD. It supports the outer-inner loop pattern where:
- Outer loop iterates over vector groups (OpenMP parallelized)
- Inner "loop" processes SIMD lanes together
Usage:

  for outer in eachOuter(layout):
    # Process all SIMD lanes for this outer index
    let simdVec = loadSimdVector(data, outer, layout)
    let result = simdVec * 2.0
    storeSimdVector(result, data, outer, layout)
Procs
proc loadSimdVector[N: static[int]; T](data: ptr UncheckedArray[T]; outerIdx: int; elemIdx: int; elemsPerSite: int; nSitesInner: int): SimdVec[N, T] {.inline.}
-
Load a SIMD vector from AoSoA layout
Loads all nSitesInner values for a given (outerIdx, elemIdx) pair. In AoSoA layout, these values are contiguous in memory.
Parameters:
- data: Pointer to AoSoA data array
- outerIdx: Vector group index
- elemIdx: Element index within tensor (0 to elemsPerSite-1)
- elemsPerSite: Number of elements per site
- nSitesInner: Number of SIMD lanes (must equal N)
proc loadSimdVectorDyn[T](data: ptr UncheckedArray[T]; outerIdx: int; elemIdx: int; elemsPerSite: int; nSitesInner: int): SimdVecDyn[T] {.inline.}
- Load a dynamic-width SIMD vector from AoSoA layout
proc loadTensorSimd[T](data: ptr UncheckedArray[T]; outerIdx: int; elemsPerSite: int; nSitesInner: int): seq[SimdVecDyn[T]] {.inline.}
-
Load all tensor elements for a vector group as SIMD vectors
Returns a sequence of SIMD vectors, one per tensor element. result[elemIdx] contains values for all SIMD lanes of that element.
proc storeSimdVector[N: static[int]; T](vec: SimdVec[N, T]; data: ptr UncheckedArray[T]; outerIdx: int; elemIdx: int; elemsPerSite: int; nSitesInner: int) {.inline.}
-
Store a SIMD vector to AoSoA layout
Stores all nSitesInner values for a given (outerIdx, elemIdx) pair.
proc storeSimdVectorDyn[T](vec: SimdVecDyn[T]; data: ptr UncheckedArray[T]; outerIdx: int; elemIdx: int; elemsPerSite: int; nSitesInner: int) {.inline.}
- Store a dynamic-width SIMD vector to AoSoA layout
proc storeTensorSimd[T](tensors: seq[SimdVecDyn[T]]; data: ptr UncheckedArray[T]; outerIdx: int; elemsPerSite: int; nSitesInner: int) {.inline.}
- Store all tensor elements from SIMD vectors
proc transformAoSoAtoAoSSimd[T](src: pointer; layout: SimdLatticeLayout; elemsPerSite: int): seq[T]
- Transform data from AoSoA back to AoS layout using SIMD layout
proc transformAoStoAoSoASimd[T](src: pointer; layout: SimdLatticeLayout; elemsPerSite: int): seq[T]
-
Transform data from AoS to AoSoA layout using SIMD layout
Input (AoS): site0[e0, e1, ...], site1[e0, e1, ...], ...
Output (AoSoA): outer0[e0: lane0..laneN, e1: lane0..laneN, ...], outer1[...], ...
Macros
macro eachOuter(forLoop: ForLoopStmt): untyped
-
SIMD-aware outer loop iterator
Iterates over vector groups (outer indices) with OpenMP parallelization. Each iteration processes nSitesInner sites via SIMD vectorization.
Usage:

  for outer in eachOuter(layout.nSitesOuter):
    for lane in 0..<layout.nSitesInner:
      let site = layout.outerInnerToLocal(outer, lane)
      # Process site...
Or more efficiently with SIMD loads:

  for outer in eachOuter(layout.nSitesOuter):
    let vec = loadSimdVector(data, outer, elemsPerSite, layout)
    # Process vec...
Templates
template forEachSimd(layout: SimdLatticeLayout; body: untyped)
-
High-level SIMD iteration template
Iterates over all vector groups with OpenMP parallelization. Within each iteration, the outerIdx variable is available.
Example:

  forEachSimd(layout):
    for e in 0..<elemsPerSite:
      let vec = loadSimdVectorDyn(data, outerIdx, e, elemsPerSite, layout.nSitesInner)
      let result = 2.0 * vec
      storeSimdVectorDyn(result, data, outerIdx, e, elemsPerSite, layout.nSitesInner)
Exports
-
getNumThreads, omp_get_max_threads, getThreadId, initOpenMP, ompParallel, omp_get_thread_num, SimdLatticeLayout, $, simdLanes, lexicographicToCoords, outerInnerToLocal, computeStrides, newSimdLatticeLayout, localToOuterInner, coordsToLexicographic, newSimdLatticeLayout, vectorGroups, validateSimdGrid, computeLocalGeom, generateCoordTable, aosoaIndexFromLocal, aosoaIndex, computeProduct, mmask64, *, SimdF32x4, m64, SimdI64x4, product, m512, +, m512d, SimdF64x8, SimdF32x8, mmask8, +, splat, *, m128h, m256h, /, SimdF64x2, /, $, $, loadStrided, m512h, +=, -, -, -, *=, zero, +, -=, *, +, m256i, SimdVecDyn, +=, +, m128d, /, SimdI32x4, max, /=, []=, m256d, -, sum, /, SimdVec, SimdI32x8, m128, store, -, SimdF64x4, /=, *=, storeStrided, SimdI64x2, mmask32, mmask16, m128i, newSimdVecDyn, sum, SimdI64x8, len, *, *, +, -, newSimdVecDyn, -=, min, m256, load, m512i, SimdF32x16, *, SimdI32x16, []