diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cdb588d5..faf4de42c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- Support Fully Asynchronous DMAs [#114](https://github.com/pulp-platform/Deeploy/pull/114) - Disallow shape inference [#128](https://github.com/pulp-platform/Deeploy/pull/128) - Remove memory-aware node bindings [#123](https://github.com/pulp-platform/Deeploy/pull/123) - Fix missing const's layout transformation and refactor NCHWtoNHWC passes [#122](https://github.com/pulp-platform/Deeploy/pull/122) @@ -55,6 +56,8 @@ This file contains the changelog for the Deeploy project. The changelog is divid - RequantHelpers.py for Neureka's TileConstraints - Added assertion that all the graph tensors after lowering have a shape annotated - Added testFloatGEMMnobias +- Profiling support and optional comments in generated DMA code for better traceability +- Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy` ### Changed - Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`. @@ -91,6 +94,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Removed Wmem variants of bindings and tile constraints from Neureka - Disabled ICCT_ITA_8 MemPool test because it was using a lowering that created shapeless tensors - Added missing shape annotation to the testTypeInferenceDifferentTypes +- Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode ### Fixed - Prevent node duplication for graphs generated via GraphSurgeon @@ -105,6 +109,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Missing layout transformation of the const's (bias, mul, add, shift in Conv/RequantizedConv) - Keep mul/add rank of requantized Neureka tile constraints - Fix bias hoisting in generic GEMM with no bias +- DMA synchronization bug causing reduced DB performance on memory-bound kernels. 
### Removed - Delete outdated and unused `.gitlab-ci.yml` file diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py index 7f503a5da..3c0bba310 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py @@ -7,14 +7,12 @@ from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity from Deeploy.TilingExtension.AsyncDma import AsyncDma from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \ - DoubleBufferingTilingCodeGeneration + DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \ - SingleBufferingTilingCodeGeneration -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import DoubleBufferingTilingMixIn, \ - ProfilingDoubleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingMixIn + ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration -class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, SingleBufferingTilingMixIn): +class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration): pass @@ -22,7 +20,7 @@ class ProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration pass -class PULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, DoubleBufferingTilingMixIn): +class PULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration): pass diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py index 646f179d9..9df0d8847 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py @@ -7,14 +7,12 @@ from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity from Deeploy.TilingExtension.AsyncDma import AsyncDma from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \ - DoubleBufferingTilingCodeGeneration + DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \ - SingleBufferingTilingCodeGeneration -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import DoubleBufferingTilingMixIn, \ - ProfilingDoubleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingMixIn + ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration -class PULPL3TilingGenerationSB(SingleBufferingTilingCodeGeneration, SingleBufferingTilingMixIn): +class PULPL3TilingGenerationSB(SingleBufferingTilingCodeGeneration): pass @@ -22,7 +20,7 @@ class ProfilingPULPL3TilingGenerationSB(SingleBufferingTilingCodeGeneration, Pro pass -class PULPL3TilingGenerationDB(DoubleBufferingTilingCodeGeneration, DoubleBufferingTilingMixIn): +class PULPL3TilingGenerationDB(DoubleBufferingTilingCodeGeneration): pass diff --git a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py index 849db0857..6c2aa3081 100644 --- a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py +++ b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py @@ -12,9 +12,16 @@ class 
L3DmaFuture(Future): - _initTemplate = NodeTemplate("pi_cl_ram_req_t ${name};") + _initTemplate = NodeTemplate("pi_cl_ram_req_t ${name} = {0};") + _deinitTemplate = NodeTemplate("") - _waitTemplate = NodeTemplate("pi_cl_ram_copy_wait(&${name});") + + _allocTemplate = NodeTemplate("") + + _waitTemplate = NodeTemplate(""" + if (${name}.size != 0) { + pi_cl_ram_copy_wait(&${name}); + }""") class L3Dma(AsyncDma): diff --git a/Deeploy/Targets/PULPOpen/DMA/MchanDma.py b/Deeploy/Targets/PULPOpen/DMA/MchanDma.py index a64097367..93bf699dc 100644 --- a/Deeploy/Targets/PULPOpen/DMA/MchanDma.py +++ b/Deeploy/Targets/PULPOpen/DMA/MchanDma.py @@ -6,14 +6,23 @@ from typing import Dict, Tuple from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer -from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, TensorGroupWaitingStrategy +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DirectionWaitingStrategy, DmaDirection, Future class MchanChannelFuture(Future): - _initTemplate = NodeTemplate("uint32_t ${name} = mchan_channel_alloc();") - _deinitTemplate = NodeTemplate("mchan_channel_free(${name});") - _waitTemplate = NodeTemplate("mchan_channel_wait(${name});") + _initTemplate = NodeTemplate("uint32_t ${name} = (uint32_t) -1;") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("${name} = mchan_channel_alloc();") + + _waitTemplate = NodeTemplate(""" +if (${name} <= MCHAN_CHANNEL_ID_MAX) { + mchan_channel_wait(${name}); + mchan_channel_free(${name}); +} +""") class MchanDma(AsyncDma): @@ -22,7 +31,7 @@ class MchanDma(AsyncDma): 1: NodeTemplate("mchan_transfer_1d(${cmd}, ${loc}, ${ext});"), 2: NodeTemplate("mchan_transfer_2d_ext_strided(${cmd}, ${loc}, ${ext}, ${size_1d}, ${stride_2d});"), } - _waitingStrategy = TensorGroupWaitingStrategy(MchanChannelFuture, "channel_id") + _waitingStrategy = DirectionWaitingStrategy(MchanChannelFuture, "channel") def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: super().__init__(transferTemplates) diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py index e9be18a53..25b150b55 100644 --- a/Deeploy/Targets/Snitch/Bindings.py +++ b/Deeploy/Targets/Snitch/Bindings.py @@ -14,7 +14,7 @@ from Deeploy.Targets.Generic.Templates import iNoNormTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \ - SnitchProfileExecutionBlockPass, SnitchSynchCoresPass + SnitchSynchCoresPass from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template @@ -37,7 +37,6 @@ TiledTransformer = CodeTransformation([ SnitchCoreFilterPass("compute"), - SnitchProfileExecutionBlockPass(), TilingVariableReplacement("L1"), TilingCallClosure(writeback = False), SnitchSynchCoresPass(), diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py index 30213d314..e8204f6ae 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py +++ b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py @@ -4,29 +4,45 @@ from typing import Tuple -from 
Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ + NodeTemplate, _NoVerbosity from Deeploy.TilingExtension.AsyncDma import AsyncDma from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \ - DoubleBufferingTilingCodeGeneration + DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \ - SingleBufferingTilingCodeGeneration -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import DoubleBufferingTilingMixIn, \ - SingleBufferingTilingMixIn + ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration -class SnitchClusterTilingSB(SingleBufferingTilingCodeGeneration, SingleBufferingTilingMixIn): +class SnitchClusterTilingSB(SingleBufferingTilingCodeGeneration): pass -class SnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration, DoubleBufferingTilingMixIn): +class SnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration): pass +class ProfilingSnitchClusterTilingSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn): + _printCycleDifference = NodeTemplate(r""" + printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + """) + + +class ProfilingSnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn): + _printCycleDifference = NodeTemplate(r""" + printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + """) + + class SnitchClusterTiling(CodeTransformationPass): def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): self.SB = SnitchClusterTilingSB(externalMemory, localMemory, dma) + self.profilingSB = ProfilingSnitchClusterTilingSB(externalMemory, localMemory, dma) + self.DB = SnitchClusterTilingDB(externalMemory, localMemory, dma) + self.profilingDB = ProfilingSnitchClusterTilingDB(externalMemory, localMemory, dma) def apply(self, ctxt: NetworkContext, @@ -34,8 +50,9 @@ def apply(self, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: if verbose.tilingProfiling: - raise NotImplementedError("Profiling not implemented for L2") - - ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name) - ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name) + ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name) + ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name) + else: + ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name) + ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name) return ctxt, executionBlock diff --git a/Deeploy/Targets/Snitch/DMA/SnitchDma.py b/Deeploy/Targets/Snitch/DMA/SnitchDma.py index aea1f0300..ac0c622cc 100644 --- a/Deeploy/Targets/Snitch/DMA/SnitchDma.py +++ b/Deeploy/Targets/Snitch/DMA/SnitchDma.py @@ -5,31 +5,41 @@ from typing import Dict, Tuple from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer -from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, 
TensorGroupWaitingStrategy +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy class SnitchBarrierFuture(Future): _initTemplate = NodeTemplate("") _deinitTemplate = NodeTemplate("") + _allocTemplate = NodeTemplate("") _waitTemplate = NodeTemplate("if (snrt_is_dm_core()) snrt_dma_wait_all();") # LMACAN: TODO: Add single transfer waiting class SnitchFuture(Future): - _initTemplate = NodeTemplate("uint16_t ${name};") + _initTemplate = NodeTemplate("snrt_dma_txid_t ${name} = (snrt_dma_txid_t) -1;") + _deinitTemplate = NodeTemplate("") - _waitTemplate = NodeTemplate("if (snrt_is_dm_core()) snrt_dma_wait(${name});") + + _allocTemplate = NodeTemplate("") + + _waitTemplate = NodeTemplate( "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait(${name});") class SnitchDma(AsyncDma): _transferTemplates = { 2: - NodeTemplate( "if (snrt_is_dm_core()) snrt_dma_start_2d(${dest}, ${src}, ${size}, ${stride_dest}, ${stride_src}, ${repeat});" ), + NodeTemplate(""" + if (snrt_is_dm_core()) { + ${future} = snrt_dma_start_2d(${dest}, ${src}, ${size}, ${stride_dest}, ${stride_src}, ${repeat}); + // WIESEP: Hack as otherwise the last committed DMA transaction ID can never be resolved. + snrt_dma_start_2d(${dest}, ${dest}, 1, 0, 0, 0); + } + """), } - _waitingStrategy = TensorGroupWaitingStrategy(SnitchBarrierFuture, "") + _waitingStrategy = PerTensorWaitingStrategy(SnitchFuture) def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: super().__init__(transferTemplates) @@ -43,7 +53,6 @@ def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, lo def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, future: Future) -> OperatorRepresentation: - _ = future operatorRepresentation: OperatorRepresentation = { "dest": localBuffer.name if direction == "ExternalToLocal" else externalBuffer.name, "src": externalBuffer.name if direction == "ExternalToLocal" else localBuffer.name, @@ -51,5 +60,6 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu "size": shape[1], "stride_dest": strideLoc[0] if direction == "ExternalToLocal" else strideExt[0], "stride_src": strideExt[0] if direction == "ExternalToLocal" else strideLoc[0], + "future": future.name } return operatorRepresentation diff --git a/Deeploy/TilingExtension/AsyncDma.py b/Deeploy/TilingExtension/AsyncDma.py index 63efbda17..a2b4efe45 100644 --- a/Deeploy/TilingExtension/AsyncDma.py +++ b/Deeploy/TilingExtension/AsyncDma.py @@ -16,6 +16,7 @@ class Future: _initTemplate: NodeTemplate + _allocTemplate: NodeTemplate _deinitTemplate: NodeTemplate _waitTemplate: NodeTemplate @@ -28,6 +29,9 @@ def _operatorRepresentation(self) -> OperatorRepresentation: def init(self) -> CodeSnippet: return CodeSnippet(self._initTemplate, self._operatorRepresentation()) + def alloc(self) -> CodeSnippet: + return CodeSnippet(self._allocTemplate, self._operatorRepresentation()) + def deinit(self) -> CodeSnippet: return CodeSnippet(self._deinitTemplate, self._operatorRepresentation()) @@ -41,25 +45,52 @@ def __init__(self, FutureCls: Type[Future]) -> None: self.FutureCls = FutureCls @abstractmethod - def getFuture(self, tensorName: str) -> Future: + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: pass class PerTensorWaitingStrategy(AsyncDmaWaitingStrategy): - def
getFuture(self, tensorName: str) -> Future: - return self.FutureCls(tensorName + "_future") + def __init__(self, FutureCls: Type[Future]) -> None: + super().__init__(FutureCls) + # map (tensorName, direction) -> Future instance so the same Future + # object is returned for repeated requests for the same tensor/direction + self._futures: Dict[Tuple[str, DmaDirection], Future] = {} + + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + key = (tensorName, direction) + if key not in self._futures: + # include direction in the future name to avoid accidental name + # collisions between directions for the same tensor + future_name = f"{tensorName}_{direction}" + self._futures[key] = self.FutureCls(future_name) + return self._futures[key] -class TensorGroupWaitingStrategy(AsyncDmaWaitingStrategy): +class DirectionWaitingStrategy(AsyncDmaWaitingStrategy): def __init__(self, FutureCls: Type[Future], asyncGroupName: str) -> None: super().__init__(FutureCls) - self.asyncGroupFuture = FutureCls(f"{asyncGroupName}_future") + self.asyncGroupName = asyncGroupName + self.asyncGroupFutures = { + "ExternalToLocal": FutureCls(asyncGroupName + "_input"), + "LocalToExternal": FutureCls(asyncGroupName + "_output") + } - def getFuture(self, tensorName: str) -> Future: + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: _ = tensorName - return self.asyncGroupFuture + return self.asyncGroupFutures[direction] + + +class BarrierWaitingStrategy(AsyncDmaWaitingStrategy): + + def __init__(self, FutureCls: Type[Future], barrierName: str) -> None: + super().__init__(FutureCls) + self.barrier = FutureCls(barrierName) + + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + _ = tensorName, direction + return self.barrier class AsyncDma(ABC): @@ -69,8 +100,8 @@ class AsyncDma(ABC): def __init__(self, transferTemplates: Dict[int, NodeTemplate]) -> None: self._transferTemplates = transferTemplates - def getFuture(self, tensorName: str) -> Future: - return self._waitingStrategy.getFuture(tensorName) + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + return self._waitingStrategy.getFuture(tensorName, direction) def supportedTransferRanks(self) -> Set[int]: return set(self._transferTemplates.keys()) @@ -98,16 +129,11 @@ def transfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBu template = self._transferTemplates[len(shape)] return [CodeSnippet(template, opRepr)] - def setup(self) -> List[CodeSnippet]: - return [] - - def teardown(self) -> List[CodeSnippet]: - return [] - class EmptyFuture(Future): _initTemplate = NodeTemplate("") + _allocTemplate = NodeTemplate("") _deinitTemplate = NodeTemplate("") _waitTemplate = NodeTemplate("") @@ -123,6 +149,9 @@ def __init__(self, dma: AsyncDma) -> None: def _transferTemplates(self) -> Dict[int, NodeTemplate]: return self.dma._transferTemplates + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + return self.dma.getFuture(tensorName, direction) + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, future: Future) -> OperatorRepresentation: @@ -131,21 +160,13 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu def transfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, 
...], direction: DmaDirection, future: Future) -> List[CodeSnippet]: - tmpFuture = self.dma.getFuture(future.name.removesuffix("_future")) callStack = [] - callStack.append(tmpFuture.init()) - callStack.extend( - self.dma.transfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction, tmpFuture)) - callStack.append(tmpFuture.wait()) - callStack.append(tmpFuture.deinit()) + dma_code = self.dma.transfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction, future) + callStack.append(future.alloc()) + callStack.extend(dma_code) + callStack.append(future.wait()) return callStack - def setup(self) -> List[CodeSnippet]: - return self.dma.setup() - - def teardown(self) -> List[CodeSnippet]: - return self.dma.teardown() - class AnydimAsyncDmaTransferAdapter: @@ -182,6 +203,9 @@ def __init__(self, name: str, depth: int): def __init__(self, dma: AsyncDma) -> None: self.dma = dma + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + return self.dma.getFuture(tensorName, direction) + def nearestSupportedTransferRank(self, transfer_rank: int) -> int: sortedRanks = sorted(self.dma.supportedTransferRanks()) @@ -238,9 +262,10 @@ def transfer(self, "offset": "ext_offset" })) - callStack.extend( - self.dma.transfer(ctxt, externalBufferOffseted, localBufferOffseted, shape[-kernelRank:], - strideExt[-kernelRank:], strideLoc[-kernelRank:], direction, future)) + dma_code = self.dma.transfer(ctxt, externalBufferOffseted, localBufferOffseted, shape[-kernelRank:], + strideExt[-kernelRank:], strideLoc[-kernelRank:], direction, future) + + callStack.extend(dma_code) callStack.append(CodeSnippet(self.NestedForLoopCloseTemplate(nestedLoopDepth), {})) return callStack elif kernelRank == transferRank: diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py index 3d0e5b701..d436d1ccc 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import copy import math from typing import List, Set, Tuple @@ -12,7 +11,8 @@ from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import TilingMetaInfo +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \ + PrototypeTilingMixIn, TilingMetaInfo from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape @@ -31,48 +31,89 @@ class DoubleBufferingTilingCodeGeneration(TilingCodeGeneration): # LMACAN: The brackets around ${tileIdxVar} are important to ensure correct order # of the modulo operation. Breaking case without the brackets is when we # put "TILING_I + 1" for tileIdxVar. 
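To make the operator-precedence point in the comment above concrete, here is a minimal Python sketch; string.Template stands in for Deeploy's NodeTemplate and the variable names are illustrative only:

from string import Template

# With the parentheses, substituting tileIdxVar = "TILING_I + 1" keeps the modulo on the whole index.
with_parens = Template("switch((${tileIdxVar}) % 2) {").substitute(tileIdxVar = "TILING_I + 1")
print(with_parens)  # switch((TILING_I + 1) % 2) {

# Without them, the generated C reads "TILING_I + 1 % 2", which C parses as "TILING_I + (1 % 2)"
# because % binds tighter than +, so the switch selects the wrong case.
without_parens = Template("switch(${tileIdxVar} % 2) {").substitute(tileIdxVar = "TILING_I + 1")
print(without_parens)  # switch(TILING_I + 1 % 2) {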
- _chooseBufferTemplate = NodeTemplate(""" - switch((${tileIdxVar}) % 2) { - case 0: ${reference} = (${type})${buffer_0}; break; - case 1: ${reference} = (${type})${buffer_1}; break; + _switchOpen = NodeTemplate("switch((${tileIdxVar}) % ${bufferCount}) {") + _caseOpen = NodeTemplate("case ${case}:") + _caseClose = NodeTemplate("break;") + + _blockClose = NodeTemplate(""" } """) + _referenceUpdate = NodeTemplate("${reference} = (${type})${update};") + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): super().__init__(externalMemory, localMemory, dma, 2) - def _generateBufferChoice(self, reference: VariableBuffer, buffers: List[_ReferenceBuffer], - tileIdxVar: str) -> CodeSnippet: - assert len(buffers) == 2, f"Only double buffering supported. Received {len(buffers)} buffers." - operatorRepresentation = { - "tileIdxVar": tileIdxVar, - "reference": reference.name, - "type": reference._type.typeName, - "buffer_0": buffers[0].name, - "buffer_1": buffers[1].name, - } - template = self._chooseBufferTemplate - return CodeSnippet(template, operatorRepresentation) + def _switch(self, caseBlocks: List[List[CodeSnippet]], tileIdxVar: str) -> List[CodeSnippet]: + assert len(caseBlocks) == self.bufferCount, f"Expected {self.bufferCount} cases, got {len(caseBlocks)}" + callStack = [CodeSnippet(self._switchOpen, {"tileIdxVar": tileIdxVar, "bufferCount": self.bufferCount})] + for i, block in enumerate(caseBlocks): + callStack.append(CodeSnippet(self._caseOpen, {"case": i})) + callStack.extend(block) + callStack.append(CodeSnippet(self._caseClose, {})) + callStack.append(CodeSnippet(self._blockClose, {})) + return callStack + + def _generateBufferChoice(self, reference: VariableBuffer, + buffers: List[_ReferenceBuffer]) -> List[List[CodeSnippet]]: + return [[ + CodeSnippet(self._referenceUpdate, { + "reference": reference.name, + "type": reference._type.typeName, + "update": buff.name + }) + ] for buff in buffers] def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - setupStatements: List[CodeSnippet] = [] - teardownStatements: List[CodeSnippet] = [] + # Double Buffering Tiling Loop Strategy + # =================================== + # - 1) Initialize all futures + # - 2) Start transfer for first input tile + # - 3) Update input reference for second tile + # - 4) for TILING_I in numTiles: + # - 4.1) Choose buffers for current tile (inputs and outputs) + # - 4.2) Input data transfer for next tile (see "4.2) Input Data Transfers") + # - 4.3) Process current tile + # - 4.4) Output data transfer for current tile (see "4.4) Output Data Transfers") + # - 5) Wait for final output tile to be ready + # - 6) Deinitialize all futures + + # 4.2) Input Data Transfers + # ----------------------------------- + # - for each input tensor: + # - 4.2.1) Wait for current input tile + # - 4.2.2) if there is a next tile: + # - 4.2.3) Choose buffers for next tile + # - 4.2.4) Start transfer for next input tile + # - 4.2.5) Update input reference for next tile + + # 4.4) Output Data Transfers + # ----------------------------------- + # - for each output tensor: + # - 4.4.1) Wait for previous output tile + # - 4.4.2) Start transfer for current output tile + # - 4.4.3) Update output reference for next tile + setupStatements: List[CodeSnippet] = [] openLoopStatements:
List[CodeSnippet] = [CodeSnippet(self._openTileLoopTemplate, {**operatorRepresentation})] - ingressDmaTransferCalls: List[CodeSnippet] = [ - CodeSnippet(self._moveTileInCheckOpenStatement, { - **operatorRepresentation, "tileIdxVar": "TILING_I+1" - }) - ] - + ingressDMAStatements: List[CodeSnippet] = [] ingressFutures: Set[Future] = set() - initialFutures: Set[Future] = set() + egressDMAStatements: List[CodeSnippet] = [] + egressFutures: Set[Future] = set() + + closeLoopStatements: List[CodeSnippet] = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})] + teardownStatements: List[CodeSnippet] = [] + + # 4.2) Input Data Transfers + # ----------------------------------- + + buffer_choices: List[List[CodeSnippet]] = [[], []] for tensorName, rectangles in dictOfArrays(tilingSchedule.inputLoadSchedule).items(): localBuffer = ctxt.lookup(operatorRepresentation[tensorName]) assert localBuffer._memoryLevel == self.localMemory @@ -98,32 +139,56 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nextLocalBufferReference = self._hoistReference(ctxt, f"{tensorName}_next", l1BuffersReferences[1]) - openLoopStatements.append(self._generateBufferChoice(localBuffer, l1BuffersReferences, "TILING_I")) - - future = self.dma.getFuture(tensorName) - ingressFutures.add(future) - - ingressDmaTransferCalls.append( - self._generateBufferChoice(nextLocalBufferReference, l1BuffersReferences, "TILING_I+1")) - ingressDmaTransferCalls.extend( - self._generateDmaTransferCalls(ctxt, tensorName, rectangles, "TILING_I+1", nextLocalBufferReference, - externalBufferRef, "ExternalToLocal", future)) + future = self.dma.getFuture(tensorName, "ExternalToLocal") + # 2) Load initial input tiles anydimAdapter = AnydimAsyncDmaTransferAdapter(self.dma) - - initialFuture = self.dma.getFuture(tensorName + "_init") - initialFutures.add(initialFuture) initialDmaTransferCalls = anydimAdapter.transfer(ctxt, externalBufferRef, localBuffer, rectangles[0].dims, stridesFromShape(externalBufferShape), stridesFromShape(rectangles[0].dims), "ExternalToLocal", - initialFuture, math.prod(externalBufferShape)) + future, math.prod(externalBufferShape)) + if future not in ingressFutures: + setupStatements.append(future.alloc()) setupStatements.extend(initialDmaTransferCalls) - setupStatements.append(initialFuture.wait()) + # 4.1) Choose buffers for current tile (inputs and outputs) + _buffer_choice = self._generateBufferChoice(localBuffer, l1BuffersReferences) + for i in range(len(buffer_choices)): + buffer_choices[i].extend(_buffer_choice[i]) + + # 4.2.1) Wait for current input tile + ingressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Wait for current input tile"})) + + if future not in ingressFutures: + ingressDMAStatements.append(future.wait()) + + # 4.2.2) if there is a next tile: + ingressDMAStatements.append( + CodeSnippet(self._moveTileInCheckOpenStatement, { + **operatorRepresentation, "tileIdxVar": "TILING_I+1" + })) + + # 4.2.3) Choose buffers for next tile + ingressDMAStatements += self._switch( + self._generateBufferChoice(nextLocalBufferReference, l1BuffersReferences), "TILING_I+1") + + # 4.2.4) Start transfer for next input tile + ingressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Transfer next input tile"})) + + # Allocate the future for the next transfer + if future not in ingressFutures: + ingressDMAStatements.append(future.alloc()) + + ingressDMAStatements.extend( + self._generateDmaTransferCalls(ctxt, tensorName, rectangles, "TILING_I+1", 
nextLocalBufferReference, + externalBufferRef, "ExternalToLocal", future)) + # 4.2.5) Update external reference for next tile referenceUpdate = self._generateExternalReferenceUpdate(ctxt, tensorName, rectangles, "TILING_I+1", externalBufferRef) if referenceUpdate is not None: - ingressDmaTransferCalls.append(referenceUpdate) + ingressDMAStatements.append(referenceUpdate) + + # 3) Update input reference for second tile initialReferenceUpdate = CodeSnippet(referenceUpdate.template, operatorRepresentation = { **referenceUpdate.operatorRepresentation, @@ -131,12 +196,14 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, }) setupStatements.append(initialReferenceUpdate) - ingressDmaTransferCalls.append(CodeSnippet(self._moveTileInCheckCloseStatement, {})) - ingressDmaWaitStatements = [f.wait() for f in ingressFutures] + # Close the "if there is a next tile" block + ingressDMAStatements.append(CodeSnippet(self._moveTileInCheckCloseStatement, {})) - egressDmaTransferCalls: List[CodeSnippet] = [] - egressFutures: Set[Future] = set() + # Add future to the set to prevent double wait/allocation + ingressFutures.add(future) + # 4.4) Output Data Transfers + # ----------------------------------- for tensorName, rectangles in dictOfArrays(tilingSchedule.outputLoadSchedule).items(): localBuffer = ctxt.lookup(operatorRepresentation[tensorName]) assert localBuffer._memoryLevel == self.localMemory @@ -160,28 +227,54 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, tensorMemoryConstraint = nodeMemoryConstraint.outputTensorMemoryConstraints[externalBuffer.name] l1BuffersReferences = self._hoistMultibufferReferences(ctxt, localBuffer, tensorMemoryConstraint) - openLoopStatements.append(self._generateBufferChoice(localBuffer, l1BuffersReferences, "TILING_I")) + # 4.1) Choose buffers for current tile (inputs and outputs) + _buffer_choice = self._generateBufferChoice(localBuffer, l1BuffersReferences) + for i in range(len(buffer_choices)): + buffer_choices[i].extend(_buffer_choice[i]) - future = self.dma.getFuture(tensorName) - egressFutures.add(future) + # 4.4.1) Wait for previous output tile + future = self.dma.getFuture(tensorName, "LocalToExternal") + + egressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Wait for previous output tile"})) + if future not in egressFutures: + egressDMAStatements.append(future.wait()) + + # 4.4.2) Start transfer for current output tile dmaTransferCalls = self._generateDmaTransferCalls(ctxt, tensorName, rectangles, "TILING_I", localBuffer, externalBufferRef, "LocalToExternal", future) - egressDmaTransferCalls.extend(dmaTransferCalls) + egressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Transfer current output tile"})) + # Allocate the future for the next transfer + if future not in egressFutures: + egressDMAStatements.append(future.alloc()) + + egressDMAStatements.extend(dmaTransferCalls) + + # 4.4.3) Update output reference for next tile referenceUpdate = self._generateExternalReferenceUpdate(ctxt, tensorName, rectangles, "TILING_I", externalBufferRef) if referenceUpdate is not None: - egressDmaTransferCalls.append(referenceUpdate) + egressDMAStatements.append(referenceUpdate) + + # Add future to the set to prevent double wait/allocation + egressFutures.add(future) + + # 4.1) Choose buffers for current tile + openLoopStatements += self._switch(buffer_choices, "TILING_I") - egressDmaWaitStatements = [f.wait() for f in egressFutures] + # 1.
Initialize all futures + setupStatements = [f.init() for f in ingressFutures | egressFutures] + setupStatements + setupStatements = [CodeSnippet(self._lineComment, {"comment": "Initialize DMA future"})] + setupStatements + # 5. Wait for final output tile to be ready + teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Wait for final output tile"})) teardownStatements.extend([f.wait() for f in egressFutures]) - setupStatements = [f.init() for f in ingressFutures | initialFutures | egressFutures] + setupStatements - teardownStatements.extend(f.deinit() for f in ingressFutures | initialFutures | egressFutures) + # 6. Deinitialize all futures - closeLoopStatements = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})] + teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Deinitialize DMA future"})) + teardownStatements.extend(f.deinit() for f in ingressFutures | egressFutures) metaInfo = TilingMetaInfo( nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", @@ -195,34 +288,83 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, # which is hardcoded by the value "L1". Change this to be memory level agnostic. kernelLevelTiling = self.localMemory == "L1") - executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDmaTransferCalls, - ingressDmaWaitStatements, [], egressDmaTransferCalls, - egressDmaWaitStatements, [], [], openLoopStatements, - closeLoopStatements, setupStatements, teardownStatements) + executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements, + openLoopStatements, closeLoopStatements, setupStatements, + teardownStatements) return ctxt, executionBlock, True - def generateTilingLoop( - self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - flatTilingSchedule = copy.copy(tilingSchedules[0]) - for tilingSchedule in tilingSchedules[1:]: - flatTilingSchedule += tilingSchedule - - offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) - - if len(offsetLists) == 0: - return ctxt, executionBlock, False - - for offsetList in offsetLists: - if not len(offsetList) == self.bufferCount: - return ctxt, executionBlock, False - - numTiles, tileIdxPtr = self._hoistTileNumAndIdxPtr(ctxt, tilingSchedules) - operatorRepresentation["numTiles"] = numTiles.name - operatorRepresentation["tileIdxPtr"] = tileIdxPtr.name - return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, - operatorRepresentation) +class ProfilingDoubleBufferingTilingMixIn(PrototypeTilingMixIn, ProfilingPrototypeMixIn): + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: List[CodeSnippet], + teardownStatements: List[CodeSnippet]) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + totalNumTiles = metaInfo.totalNumTiles + + executionBlock.addLeft(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_start_measurements", + "tileIdxVar": 0 + }) + + executionBlock = cls.measurementArrayDeclaration(executionBlock, metaInfo, bufferingStr = "DB") + + executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, 
setupStatements, + teardownStatements) + executionBlock.addRight(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_end_measurements", + "tileIdxVar": totalNumTiles - 1 + }) + + executionBlock = cls.injectPrintCycleDiff(executionBlock, metaInfo) + + return executionBlock + + @classmethod + def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], + egressDMAStatements: List[CodeSnippet], + closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + tileIdxVar = metaInfo.tileIdxVar + + _openLoopStatements = [openLoopStatements[0]] + _openLoopStatements.append(CodeSnippet(cls._measureConditionSetup, {"cond": f"{tileIdxVar} > 0"})) + _openLoopStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_start_measurements", + "tileIdxVar": tileIdxVar + })) + _openLoopStatements.append(CodeSnippet(cls._measureConditionEnd, {})) + _openLoopStatements += openLoopStatements[1:] + + _ingressDMAStatements = [] + _ingressDMAStatements += ingressDMAStatements + _ingressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_end_measurements", + "tileIdxVar": tileIdxVar + })) + + executionBlock = cls.kernelProfilingWrap(executionBlock, metaInfo) + + _egressDMAStatements = [] + _egressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_start_measurements", + "tileIdxVar": f"{tileIdxVar}" + })) + _egressDMAStatements += egressDMAStatements + _egressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_end_measurements", + "tileIdxVar": f"{tileIdxVar}" + })) + + executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements, + _egressDMAStatements, closeLoopStatements) + return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py index bc863e8d2..268aa8fca 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import copy from typing import Dict, List, Set, Tuple from Deeploy.AbstractDataTypes import VoidType @@ -11,7 +10,8 @@ from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import TilingMetaInfo +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \ + PrototypeTilingMixIn, TilingMetaInfo from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme @@ -27,7 +27,6 @@ def _generateTransferScheduleCalls( TensorMemoryConstraint], tileIdxVar: str, direction: DmaDirection) -> Tuple[NetworkContext, List[CodeSnippet], Set[Future]]: callStack: List[CodeSnippet] = [] - referenceUpdates: 
List[CodeSnippet] = [] futures: Set[Future] = set() for tensorName, rectangles in dictOfArrays(transferSchedule).items(): @@ -50,8 +49,11 @@ def _generateTransferScheduleCalls( shape = externalBufferShape, override_type = VoidType) - future = self.dma.getFuture(tensorName) - futures.add(future) + future = self.dma.getFuture(tensorName, direction) + + # Allocate a future for this transfer + if future not in futures: + callStack.append(future.alloc()) callStack.extend( self._generateDmaTransferCalls(ctxt, tensorName, rectangles, tileIdxVar, localBuffer, externalBufferRef, @@ -62,29 +64,54 @@ def _generateTransferScheduleCalls( if referenceUpdate is not None: callStack.append(referenceUpdate) + futures.add(future) + return ctxt, callStack, futures def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - ctxt, ingressDmaTransferCalls, ingressFutures = self._generateTransferScheduleCalls( + + # Single Buffering Tiling Loop Strategy + # =================================== + # - 1) Initialize all futures + # - 2) for TILING_I in numTiles: + # - 2.1) Input data transfer for current tile (see "4.2) Input Data Transfers") + # - 2.2) Process current tile + # - 2.3) Output data transfer for current tile (see "4.4) Output Data Transfers") + # - 3) Deinitialize all futures + + # 2) for TILING_I in numTiles: + openLoopStatements = [CodeSnippet(self._openTileLoopTemplate, {**operatorRepresentation})] + + # 2.2) Input data transfer for current tile + ctxt, ingressDMAStatements, ingressFutures = self._generateTransferScheduleCalls( ctxt, operatorRepresentation, tilingSchedule.inputLoadSchedule, nodeMemoryConstraint.inputTensorMemoryConstraints, "TILING_I", "ExternalToLocal") - ctxt, egressDmaTransferCalls, egressFutures = self._generateTransferScheduleCalls( + + ingressDMAStatements = [CodeSnippet(self._lineComment, {"comment": "Transfer input tiles"}) + ] + ingressDMAStatements + ingressDMAStatements += [CodeSnippet(self._lineComment, {"comment": "Wait for input tiles"})] + ingressDMAStatements += [future.wait() for future in ingressFutures] + + # 2.4) Output data transfer for current tile + ctxt, egressDMAStatements, egressFutures = self._generateTransferScheduleCalls( ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, nodeMemoryConstraint.outputTensorMemoryConstraints, "TILING_I", "LocalToExternal") + egressDMAStatements = [CodeSnippet(self._lineComment, {"comment": "Transfer output tiles"}) + ] + egressDMAStatements + egressDMAStatements += [CodeSnippet(self._lineComment, {"comment": "Wait for output tiles"})] + egressDMAStatements += [future.wait() for future in egressFutures] - ingressDmaWaitStatements = [future.wait() for future in ingressFutures] - egressDmaWaitStatements = [future.wait() for future in egressFutures] - - setupStatements = self.dma.setup() - setupStatements += [f.init() for f in ingressFutures | egressFutures] + # 1) Initialize all futures + setupStatements = [CodeSnippet(self._lineComment, {"comment": "Initialize DMA futures"})] + setupStatements.extend([f.init() for f in ingressFutures | egressFutures]) - teardownStatements = self.dma.teardown() - teardownStatements.extend(f.deinit() for f in ingressFutures | egressFutures) + # 3) Deinitialize all futures + teardownStatements = [CodeSnippet(self._lineComment, {"comment": 
"Deinitialize DMA futures"})] + teardownStatements.extend([f.deinit() for f in ingressFutures | egressFutures]) - openLoopStatements = [CodeSnippet(self._openTileLoopTemplate, {**operatorRepresentation})] closeLoopStatements = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})] metaInfo = TilingMetaInfo( @@ -99,34 +126,69 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, # which is hardcoded by the value "L1". Change this to be memory level agnostic. kernelLevelTiling = self.localMemory == "L1") - executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDmaTransferCalls, - ingressDmaWaitStatements, [], egressDmaTransferCalls, - egressDmaWaitStatements, [], [], openLoopStatements, - closeLoopStatements, setupStatements, teardownStatements) + executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements, + openLoopStatements, closeLoopStatements, setupStatements, + teardownStatements) return ctxt, executionBlock, True - def generateTilingLoop( - self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - flatTilingSchedule = copy.copy(tilingSchedules[0]) - for tilingSchedule in tilingSchedules[1:]: - flatTilingSchedule += tilingSchedule - - offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) - - if len(offsetLists) == 0: - return ctxt, executionBlock, False - - for offsetList in offsetLists: - if not len(offsetList) == self.bufferCount: - return ctxt, executionBlock, False - - numTiles, tileIdxPtr = self._hoistTileNumAndIdxPtr(ctxt, tilingSchedules) - operatorRepresentation["numTiles"] = numTiles.name - operatorRepresentation["tileIdxPtr"] = tileIdxPtr.name - return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, - operatorRepresentation) +class ProfilingSingleBufferingTilingMixIn(PrototypeTilingMixIn, ProfilingPrototypeMixIn): + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: List[CodeSnippet], + teardownStatements: List[CodeSnippet]) -> ExecutionBlock: + + executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, + teardownStatements) + + executionBlock = cls.measurementArrayDeclaration(executionBlock, metaInfo, bufferingStr = "SB") + + executionBlock = cls.injectPrintCycleDiff(executionBlock, metaInfo) + + return executionBlock + + @classmethod + def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], + egressDMAStatements: List[CodeSnippet], + closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + tileIdxVar = metaInfo.tileIdxVar + + _openLoopStatements = [openLoopStatements[0]] + _openLoopStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_start_measurements", + "tileIdxVar": tileIdxVar + })) + _openLoopStatements += openLoopStatements[1:] + + _ingressDMAStatements = [] + _ingressDMAStatements += ingressDMAStatements + _ingressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": 
f"{nodeName}_ingress_dma_wait_end_measurements", + "tileIdxVar": tileIdxVar + })) + + executionBlock = cls.kernelProfilingWrap(executionBlock, metaInfo) + + _egressDMAStatements = [] + _egressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_start_measurements", + "tileIdxVar": tileIdxVar + })) + _egressDMAStatements += egressDMAStatements + _egressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_end_measurements", + "tileIdxVar": tileIdxVar + })) + + executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements, + _egressDMAStatements, closeLoopStatements) + return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py index 0db3109ae..a796c52f7 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py @@ -40,6 +40,8 @@ def transposeListOfLists(listOfLists: List[List[T]]) -> List[List[T]]: class TilingCodeGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn, PrototypeTilingMixIn, TilingHoistingMixIn): + _lineComment = NodeTemplate("\n// ${comment}") + _relativeOffsetReferenceUpdateTemplate = NodeTemplate(""" // UPDATE VARIABLE ${reference} ${reference} += ${relativeOffset}; @@ -62,12 +64,36 @@ class TilingCodeGeneration(CodeTransformationPass, IntrospectiveCodeTransformati """) @abstractmethod + def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, + nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, + variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + pass + def generateTilingLoop( self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - return ctxt, executionBlock, False + flatTilingSchedule = copy.copy(tilingSchedules[0]) + for tilingSchedule in tilingSchedules[1:]: + flatTilingSchedule += tilingSchedule + + offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) + + if len(offsetLists) == 0: + return ctxt, executionBlock, False + + for offsetList in offsetLists: + if not len(offsetList) == self.bufferCount: + return ctxt, executionBlock, False + + numTiles, tileIdxPtr = self._hoistTileNumAndIdxPtr(ctxt, tilingSchedules) + operatorRepresentation["numTiles"] = numTiles.name + operatorRepresentation["tileIdxPtr"] = tileIdxPtr.name + + return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, + operatorRepresentation) def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, bufferCount: int): self.externalMemory = externalMemory @@ -102,8 +128,9 @@ def _generateDmaTransferCalls(self, ctxt: NetworkContext, tensorName: str, trans initSnippets = anydimAdapter.transfer(ctxt, externalBuffer, localBuffer, transfers[0].dims, stridesFromShape(externalBuffer.shape), stridesFromShape(transfers[0].dims), direction, future, - math.prod(externalBuffer.shape)) + math.prod(externalBuffer.shape,)) + # Add allocation 
snippets
        templates = [snippet.template for snippet in initSnippets]
        opReprUpdates = [[] for _ in range(len(initSnippets))]
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
index 1a875b626..2f6c1e959 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from abc import ABC, abstractmethod
+from abc import ABC
 from dataclasses import dataclass
 from typing import List, Literal
 
@@ -20,52 +20,12 @@ class TilingMetaInfo:
     kernelLevelTiling: bool
 
 
-_CodeSegmentType = List[CodeSnippet]
-
-_measureCycles = NodeTemplate("""
-${measurements}[${tileIdxVar}] = getCycles();
-""")
-
-_measurementArrayDeclaration = NodeTemplate("""
-uint32_t ${measurements}[${totalNumTiles}];
-""")
-
-_stringDeclaration = NodeTemplate("""
-const static char ${name}[] = "${string}";
-""")
-
-_measureConditionSetup = NodeTemplate("""
-if(${cond}){
-""")
-
-_measureConditionEnd = NodeTemplate("""
-}
-""")
-
-_printLoopSetup = NodeTemplate("""
-StopTimer();
-for (int ${profileIdxVar} = ((*${tileIdxPtr} > 0) ? ${numTiles}[(*${tileIdxPtr} - 1)] : 0);
-     ${profileIdxVar} < ${numTiles}[*${tileIdxPtr}];
-     ${profileIdxVar}++){
-""")
-
-_printCycleDifference = NodeTemplate(r"""
-printf("%s%u] %s%u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \
-${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr});
-""")
-
-_printLoopTeardown = NodeTemplate("""
-}
-StartTimer();
-""")
-
-
 class PrototypeTilingMixIn(ABC):
 
     @classmethod
     def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                                     setupStatements: _CodeSegmentType,
-                                     teardownStatements: _CodeSegmentType) -> ExecutionBlock:
+                                     setupStatements: List[CodeSnippet],
+                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
 
         for transaction in reversed(setupStatements):
             executionBlock.addLeft(transaction.template, transaction.operatorRepresentation)
 
@@ -77,53 +37,70 @@ def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo:
 
     @classmethod
     def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                         openLoopStatements: _CodeSegmentType, closeLoopStatements: _CodeSegmentType) -> ExecutionBlock:
+                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
+                         egressDMAStatements: List[CodeSnippet],
+                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
 
-        for transaction in reversed(openLoopStatements):
+        for transaction in reversed(openLoopStatements + ingressDMAStatements):
             executionBlock.addLeft(transaction.template, transaction.operatorRepresentation)
 
-        for transaction in closeLoopStatements:
+        for transaction in egressDMAStatements + closeLoopStatements:
             executionBlock.addRight(transaction.template, transaction.operatorRepresentation)
 
         return executionBlock
 
     @classmethod
     def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                              ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType,
-                              ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType,
-                              egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType,
-                              variableUpdates: _CodeSegmentType, openLoopStatements: _CodeSegmentType,
-                              closeLoopStatements: _CodeSegmentType, setupStatements: _CodeSegmentType,
-                              teardownStatements: _CodeSegmentType) -> ExecutionBlock:
-
-        if not hasattr(cls, "generateInnerCode"):
-            raise Exception("You need to mix in a code gen strategy!")
-
-        newExecutionBlock = cls.generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls,
-                                                  ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls,
-                                                  egressDMAWaitStatements, egressDMAUpdates, variableUpdates)
+                              ingressDMAStatements: List[CodeSnippet], egressDMAStatements: List[CodeSnippet],
+                              openLoopStatements: List[CodeSnippet], closeLoopStatements: List[CodeSnippet],
+                              setupStatements: List[CodeSnippet],
+                              teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
 
-        newExecutionBlock = cls.generateLoopCode(newExecutionBlock, metaInfo, openLoopStatements, closeLoopStatements)
+        executionBlock = cls.generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
+                                              egressDMAStatements, closeLoopStatements)
 
-        newExecutionBlock = cls.generateSetupAndTeardownCode(newExecutionBlock, metaInfo, setupStatements,
-                                                             teardownStatements)
-
-        return newExecutionBlock
-
-
-class TilingCodeGenMixin(ABC):
-
-    @abstractmethod
-    def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                          ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType,
-                          ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType,
-                          egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType,
-                          variableUpdates: _CodeSegmentType) -> ExecutionBlock:
+        executionBlock = cls.generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, teardownStatements)
 
         return executionBlock
 
 
 class ProfilingPrototypeMixIn(ABC):
 
+    _measureCycles = NodeTemplate("""
+    ${measurements}[${tileIdxVar}] = getCycles();
+    """)
+
+    _measurementArrayDeclaration = NodeTemplate("""
+    uint32_t ${measurements}[${totalNumTiles}];
+    """)
+
+    _stringDeclaration = NodeTemplate("""
+    const static char ${name}[] = "${string}";
+    """)
+
+    _printLoopSetup = NodeTemplate("""
+    StopTimer();
+    for (int ${profileIdxVar} = ((*${tileIdxPtr} > 0) ? ${numTiles}[(*${tileIdxPtr} - 1)] : 0);
+         ${profileIdxVar} < ${numTiles}[*${tileIdxPtr}];
+         ${profileIdxVar}++){
+    """)
+
+    _printCycleDifference = NodeTemplate(r"""
+    printf("%s%u] %s%u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \
+    ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr});
+    """)
+
+    _printLoopTeardown = NodeTemplate("""
+    }
+    StartTimer();
+    """)
+
+    _measureConditionSetup = NodeTemplate("""
+    if(${cond}){
+    """)
+
+    _measureConditionEnd = NodeTemplate("""
+    }
+    """)
 
     @classmethod
     def measurementArrayDeclaration(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
@@ -142,17 +119,17 @@ def measurementArrayDeclaration(cls, executionBlock: ExecutionBlock, metaInfo: T
         measurementsList = ["kernel_start", "kernel_end"] + measurementsList
 
         for measurements in measurementsList:
-            executionBlock.addLeft(_measurementArrayDeclaration, {
+            executionBlock.addLeft(cls._measurementArrayDeclaration, {
                 "measurements": f"{nodeName}_{measurements}_measurements",
                 "totalNumTiles": totalNumTiles
             })
 
-        executionBlock.addLeft(_stringDeclaration, {
+        executionBlock.addLeft(cls._stringDeclaration, {
             "name": f"{nodeName}_prefix",
             "string": f"[{nodeName}][{bufferingStr}][{nodeOps} ops][Tile ",
         })
 
-        executionBlock.addLeft(_stringDeclaration, {
+        executionBlock.addLeft(cls._stringDeclaration, {
             "name": f"{nodeName}_suffix",
             "string": " cycles \\n",
         })
@@ -165,10 +142,9 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
         numTiles = metaInfo.numTiles
         nodeName = metaInfo.nodeName
         tileIdxPtr = metaInfo.tileIdxPtr
-        totalNumTiles = metaInfo.totalNumTiles
         profileIdxVar = "PROFILING_I"
 
-        executionBlock.addRight(_printLoopSetup, {
+        executionBlock.addRight(cls._printLoopSetup, {
             "numTiles": numTiles,
             "nodeName": nodeName,
             "profileIdxVar": profileIdxVar,
@@ -176,7 +152,7 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
         })
 
         executionBlock.addRight(
-            _printCycleDifference, {
+            cls._printCycleDifference, {
                 "prefixStr": f"{nodeName}_prefix",
                 "suffixStr": f"{nodeName}_suffix",
                 "flavorStr": "Input DMA took ",
@@ -187,7 +163,7 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
 
         if metaInfo.kernelLevelTiling:
             executionBlock.addRight(
-                _printCycleDifference, {
+                cls._printCycleDifference, {
                     "prefixStr": f"{nodeName}_prefix",
                     "suffixStr": f"{nodeName}_suffix",
                     "flavorStr": "Kernel took ",
@@ -197,7 +173,7 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
                 })
 
         executionBlock.addRight(
-            _printCycleDifference, {
+            cls._printCycleDifference, {
                 "prefixStr": f"{nodeName}_prefix",
                 "suffixStr": f"{nodeName}_suffix",
                 "flavorStr": "Output DMA took ",
@@ -206,7 +182,7 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
                 "profileIdxVar": profileIdxVar,
             })
 
-        executionBlock.addRight(_printLoopTeardown, {})
+        executionBlock.addRight(cls._printLoopTeardown, {})
 
         return executionBlock
 
@@ -216,218 +192,13 @@ def kernelProfilingWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMet
         tileIdxVar = metaInfo.tileIdxVar
 
         if metaInfo.kernelLevelTiling:
-            executionBlock.addLeft(_measureCycles, {
+            executionBlock.addLeft(cls._measureCycles, {
                 "measurements": f"{nodeName}_kernel_start_measurements",
                 "tileIdxVar": tileIdxVar
             })
-            executionBlock.addRight(_measureCycles, {
+            executionBlock.addRight(cls._measureCycles, {
                 "measurements": f"{nodeName}_kernel_end_measurements",
                 "tileIdxVar": tileIdxVar
             })
 
         return executionBlock
-
-
-class SingleBufferingTilingMixIn(PrototypeTilingMixIn, TilingCodeGenMixin):
-
-    @classmethod
-    def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                          ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType,
-                          ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType,
-                          egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType,
-                          variableUpdates: _CodeSegmentType) -> ExecutionBlock:
-
-        # Structure:
-        # Update DMA Structs
-        # Transfer in tiles (async)
-        # Update tile variables
-        # Wait for tiles
-
-        # Kernel execution
-
-        # Update DMA Structs
-        # Transfer out tiles (async)
-        # Wait for out transfers
-
-        for transaction in reversed(ingressDMAUpdates + ingressDMATransferCalls + variableUpdates +
-                                    ingressDMAWaitStatements):
-            executionBlock.addLeft(transaction.template, transaction.operatorRepresentation)
-
-        for transaction in (egressDMAUpdates + egressDMATransferCalls + egressDMAWaitStatements):
-            executionBlock.addRight(transaction.template, transaction.operatorRepresentation)
-
-        return executionBlock
-
-
-class ProfilingSingleBufferingTilingMixIn(SingleBufferingTilingMixIn, ProfilingPrototypeMixIn):
-
-    @classmethod
-    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                                     setupStatements: _CodeSegmentType,
-                                     teardownStatements: _CodeSegmentType) -> ExecutionBlock:
-
-        executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
-                                                              teardownStatements)
-
-        executionBlock = cls.measurementArrayDeclaration(executionBlock, metaInfo, bufferingStr = "SB")
-
-        executionBlock = cls.injectPrintCycleDiff(executionBlock, metaInfo)
-
-        return executionBlock
-
-    @classmethod
-    def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                          ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType,
-                          ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType,
-                          egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType,
-                          variableUpdates: _CodeSegmentType) -> ExecutionBlock:
-
-        nodeName = metaInfo.nodeName
-        tileIdxVar = metaInfo.tileIdxVar
-
-        executionBlock = cls.kernelProfilingWrap(executionBlock, metaInfo)
-
-        _ingressDMAWaitStatements = []
-        _ingressDMAWaitStatements.append(
-            CodeSnippet(_measureCycles, {
-                "measurements": f"{nodeName}_ingress_dma_wait_start_measurements",
-                "tileIdxVar": tileIdxVar
-            }))
-        _ingressDMAWaitStatements += ingressDMAWaitStatements
-        _ingressDMAWaitStatements.append(
-            CodeSnippet(_measureCycles, {
-                "measurements": f"{nodeName}_ingress_dma_wait_end_measurements",
-                "tileIdxVar": tileIdxVar
-            }))
-
-        _egressDMAWaitStatements = []
-        _egressDMAWaitStatements.append(
-            CodeSnippet(_measureCycles, {
-                "measurements": f"{nodeName}_egress_dma_wait_start_measurements",
-                "tileIdxVar": tileIdxVar
-            }))
-        _egressDMAWaitStatements += egressDMAWaitStatements
-        _egressDMAWaitStatements.append(
-            CodeSnippet(_measureCycles, {
-                "measurements": f"{nodeName}_egress_dma_wait_end_measurements",
-                "tileIdxVar": tileIdxVar
-            }))
-
-        executionBlock = super().generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls,
-                                                   _ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls,
-                                                   _egressDMAWaitStatements, egressDMAUpdates, variableUpdates)
-
-        return executionBlock
-
-
-class DoubleBufferingTilingMixIn(PrototypeTilingMixIn, TilingCodeGenMixin):
-
-    @classmethod
-    def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                          ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType,
-                          ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType,
-                          egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType,
-                          variableUpdates: _CodeSegmentType) -> ExecutionBlock:
-
-        # Structure:
-
-        # Update input DMA Structs
-        # Update tile variables
-        # Wait for current input tiles
-        # Transfer in next input tiles (async)
-        # Update output DMA Structs
-        # Wait for current output tiles
-
-        # Kernel execution
-
-        # Transfer out tiles (async)
-
-        for transaction in reversed(ingressDMAWaitStatements + ingressDMAUpdates + ingressDMATransferCalls +
-                                    variableUpdates + egressDMAWaitStatements + egressDMAUpdates):
-            executionBlock.addLeft(transaction.template, transaction.operatorRepresentation)
-
-        for transaction in egressDMATransferCalls:
-            executionBlock.addRight(transaction.template, transaction.operatorRepresentation)
-
-        return executionBlock
-
-
-class ProfilingDoubleBufferingTilingMixIn(DoubleBufferingTilingMixIn, ProfilingPrototypeMixIn):
-
-    @classmethod
-    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                                     setupStatements: _CodeSegmentType,
-                                     teardownStatements: _CodeSegmentType) -> ExecutionBlock:
-
-        nodeName = metaInfo.nodeName
-        totalNumTiles = metaInfo.totalNumTiles
-
-        executionBlock.addLeft(_measureCycles, {
-            "measurements": f"{nodeName}_ingress_dma_wait_start_measurements",
-            "tileIdxVar": 0
-        })
-
-        executionBlock = cls.measurementArrayDeclaration(executionBlock, metaInfo, bufferingStr = "DB")
-
-        executionBlock.addRight(_measureCycles, {
-            "measurements": f"{nodeName}_egress_dma_wait_start_measurements",
-            "tileIdxVar": totalNumTiles - 1
-        })
-        executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
-                                                              teardownStatements)
-        executionBlock.addRight(_measureCycles, {
-            "measurements": f"{nodeName}_egress_dma_wait_end_measurements",
-            "tileIdxVar": totalNumTiles - 1
-        })
-
-        executionBlock = cls.injectPrintCycleDiff(executionBlock, metaInfo)
-
-        return executionBlock
-
-    @classmethod
-    def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                          ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType,
-                          ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType,
-                          egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType,
-                          variableUpdates: _CodeSegmentType) -> ExecutionBlock:
-
-        nodeName = metaInfo.nodeName
-        tileIdxVar = metaInfo.tileIdxVar
-
-        executionBlock = cls.kernelProfilingWrap(executionBlock, metaInfo)
-
-        _ingressDMAWaitStatements = []
-        _ingressDMAWaitStatements.append(CodeSnippet(_measureConditionSetup, {"cond": f"{tileIdxVar} > 0"}))
-        _ingressDMAWaitStatements.append(
-            CodeSnippet(_measureCycles, {
-                "measurements": f"{nodeName}_ingress_dma_wait_start_measurements",
-                "tileIdxVar": tileIdxVar
-            }))
-        _ingressDMAWaitStatements.append(CodeSnippet(_measureConditionEnd, {}))
-        _ingressDMAWaitStatements += ingressDMAWaitStatements
-        _ingressDMAWaitStatements.append(
-            CodeSnippet(_measureCycles, {
-                "measurements": f"{nodeName}_ingress_dma_wait_end_measurements",
-                "tileIdxVar": tileIdxVar
-            }))
-
-        _egressDMAWaitStatements = []
-        _egressDMAWaitStatements.append(CodeSnippet(_measureConditionSetup, {"cond": f"{tileIdxVar} > 0"}))
-        _egressDMAWaitStatements.append(
-            CodeSnippet(_measureCycles, {
"measurements": f"{nodeName}_egress_dma_wait_start_measurements", - "tileIdxVar": f"{tileIdxVar} - 1" - })) - _egressDMAWaitStatements += egressDMAWaitStatements - _egressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "measurements": f"{nodeName}_egress_dma_wait_end_measurements", - "tileIdxVar": f"{tileIdxVar} - 1" - })) - _egressDMAWaitStatements.append(CodeSnippet(_measureConditionEnd, {})) - - executionBlock = super().generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls, - _ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls, - _egressDMAWaitStatements, egressDMAUpdates, variableUpdates) - - return executionBlock diff --git a/DeeployTest/Platforms/Snitch/main.c b/DeeployTest/Platforms/Snitch/main.c index 89e655143..a7251f384 100644 --- a/DeeployTest/Platforms/Snitch/main.c +++ b/DeeployTest/Platforms/Snitch/main.c @@ -77,23 +77,20 @@ int main(void) { snrt_cluster_hw_barrier(); -#if !defined(BANSHEE_SIMULATION) && !defined(GVSOC_SIMULATION) - if (snrt_is_dm_core()) { - ResetTimer(); - StartTimer(); - } -#endif // BANSHEE_SIMULATION and GVSOC_SIMULATION - + ResetTimer(); + StartTimer(); + snrt_cluster_hw_barrier(); RunNetwork(compute_core_id, num_compute_cores); uint32_t runtimeCycles = 0; -#if !defined(BANSHEE_SIMULATION) && !defined(GVSOC_SIMULATION) if (snrt_is_dm_core()) { runtimeCycles = getCycles(); +#if !defined(BANSHEE_SIMULATION) && !defined(GVSOC_SIMULATION) DUMP(runtimeCycles); - StopTimer(); - } #endif // BANSHEE_SIMULATION and GVSOC_SIMULATION + } + + StopTimer(); snrt_cluster_hw_barrier(); diff --git a/TargetLibraries/PULPOpen/inc/mchan_v6.h b/TargetLibraries/PULPOpen/inc/mchan_v6.h index bf7cfc1dd..34a42d882 100644 --- a/TargetLibraries/PULPOpen/inc/mchan_v6.h +++ b/TargetLibraries/PULPOpen/inc/mchan_v6.h @@ -7,6 +7,8 @@ #ifndef __MCHAN_V6_H__ #define __MCHAN_V6_H__ +#include "assert.h" + // Requires to have MCHAN_BASE_ADDR, MCHAN_EVENT defined outside of header #ifndef MCHAN_BASE_ADDR #error "[mchan_v6.h] MCHAN_BASE_ADDR not defined!" @@ -34,6 +36,7 @@ #include "pmsis.h" #define MCHAN_TRANSFER_LEN_SIZE (16) +#define MCHAN_CHANNEL_ID_MAX (15) #define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0)) #define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0)) @@ -68,17 +71,17 @@ static void mchan_transfer_2d_ext_strided(uint32_t cmd, void *loc, void *ext, static uint32_t mchan_channel_alloc() { return *cmd_ptr; } static void mchan_channel_free(uint32_t channel_id) { - // TODO: assert channel_id is smaller then 32 + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); *status_ptr = 1 << channel_id; } static uint32_t mchan_channel_is_busy(uint32_t channel_id) { - // TODO: assert channel_id is smaller then 32 + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); return *status_ptr & (1 << channel_id); } static void mchan_channel_wait(uint32_t channel_id) { - // TODO: assert channel_id is smaller then 32 + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); #if defined(MCHAN_EVENT) while (mchan_channel_is_busy(channel_id)) eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT); diff --git a/TargetLibraries/PULPOpen/inc/mchan_v7.h b/TargetLibraries/PULPOpen/inc/mchan_v7.h index 1078584b5..32ef836f3 100644 --- a/TargetLibraries/PULPOpen/inc/mchan_v7.h +++ b/TargetLibraries/PULPOpen/inc/mchan_v7.h @@ -7,6 +7,8 @@ #ifndef __MCHAN_V7_H__ #define __MCHAN_V7_H__ +#include "assert.h" + // Requires to have MCHAN_BASE_ADDR, MCHAN_EVENT defined outside of header #ifndef MCHAN_BASE_ADDR #error "[mchan_v7.h] MCHAN_BASE_ADDR not defined!" 
@@ -34,6 +36,7 @@
 #include "pmsis.h"
 
 #define MCHAN_TRANSFER_LEN_SIZE (17)
+#define MCHAN_CHANNEL_ID_MAX (15)
 
 #define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0))
 #define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0))
@@ -94,17 +97,17 @@ static void mchan_transfer_2d_loc_strided_ext_strided(
 static uint32_t mchan_channel_alloc() { return *cmd_ptr; }
 
 static void mchan_channel_free(uint32_t channel_id) {
-  // TODO: assert tid is smaller then 32
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
   *status_ptr = 1 << channel_id;
 }
 
 static uint32_t mchan_channel_is_busy(uint32_t channel_id) {
-  // TODO: assert tid is smaller then 32
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
   return *status_ptr & (1 << channel_id);
 }
 
 static void mchan_channel_wait(uint32_t channel_id) {
-  // TODO: assert tid is smaller then 32
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
 #if defined(MCHAN_EVENT)
   while (mchan_channel_is_busy(channel_id))
     eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT);
diff --git a/TargetLibraries/Snitch/src/CycleCounter.c b/TargetLibraries/Snitch/src/CycleCounter.c
index 47724816f..3861c421c 100644
--- a/TargetLibraries/Snitch/src/CycleCounter.c
+++ b/TargetLibraries/Snitch/src/CycleCounter.c
@@ -14,7 +14,7 @@ static uint32_t instr_end[NUM_CORES] __attribute__((section(".l1")));
 static uint32_t running[NUM_CORES] __attribute__((section(".l1")));
 
 void ResetTimer() {
-  // snrt_reset_perf_counter(SNRT_PERF_CNT0);
+  snrt_reset_perf_counter(SNRT_PERF_CNT0);
   uint32_t const core_id = snrt_global_core_idx();
   uint32_t _timer_init = read_csr(mcycle);
   uint32_t _instr_init = read_csr(minstret);
@@ -26,7 +26,9 @@ void ResetTimer() {
 }
 
 void StartTimer() {
-  // snrt_start_perf_counter(SNRT_PERF_CNT0, SNRT_PERF_CNT_CYCLES, 0);
+  if (snrt_is_dm_core()) {
+    snrt_start_perf_counter(SNRT_PERF_CNT0, SNRT_PERF_CNT_CYCLES, 0);
+  }
   uint32_t const core_id = snrt_global_core_idx();
   timer_init[core_id] = read_csr(mcycle);
   instr_init[core_id] = read_csr(minstret);
@@ -34,17 +36,16 @@ void StartTimer() {
 }
 
 void StopTimer() {
-  // if (!snrt_is_dm_core()) {
-  //   snrt_stop_perf_counter(SNRT_PERF_CNT0);
-  // }
+  if (snrt_is_dm_core()) {
+    snrt_stop_perf_counter(SNRT_PERF_CNT0);
+  }
   uint32_t const core_id = snrt_global_core_idx();
   timer_end[core_id] = read_csr(mcycle);
-  timer_end[core_id] = read_csr(minstret);
+  instr_end[core_id] = read_csr(minstret);
   running[core_id] = 0;
 }
 
 uint32_t getCycles() {
-  // return snrt_get_perf_counter(SNRT_PERF_CNT0);
   uint32_t const core_id = snrt_global_core_idx();
   if (running[core_id]) {
     return read_csr(mcycle) - timer_init[core_id];
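
The CycleCounter rework above pairs with the earlier `DeeployTest/Platforms/Snitch/main.c` change: every core snapshots `mcycle`/`minstret`, while only the DM core drives the `SNRT_PERF_CNT0` hardware counter. The following is a minimal usage sketch of that pattern, not part of the patch; it assumes the Snitch runtime headers and Deeploy's cycle-counter API are on the include path, and `CycleCounter.h` plus `run_workload` are illustrative placeholders.

```c
#include <stdint.h>

#include "CycleCounter.h" // assumed header exposing ResetTimer/StartTimer/StopTimer/getCycles
#include "snrt.h"         // Snitch runtime: snrt_is_dm_core, snrt_cluster_hw_barrier

extern void run_workload(void); // placeholder for RunNetwork(...) or any profiled region

void profile_workload(void) {
  // All cores snapshot mcycle/minstret; the DM core additionally
  // resets and starts SNRT_PERF_CNT0 (as in the patched Start/ResetTimer).
  ResetTimer();
  StartTimer();
  snrt_cluster_hw_barrier();

  run_workload();

  if (snrt_is_dm_core()) {
    // While the timer is running, getCycles() returns mcycle - timer_init.
    uint32_t runtimeCycles = getCycles();
    (void)runtimeCycles; // e.g. dumped to the host outside Banshee/GVSoC runs
  }

  StopTimer();
  snrt_cluster_hw_barrier();
}
```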
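
Likewise, the `mchan_v6.h`/`mchan_v7.h` changes replace the old TODO comments with hard bounds checks against `MCHAN_CHANNEL_ID_MAX`. The sketch below shows the intended alloc/wait/free discipline around a transfer; it is illustrative only, assumes the PULP SDK include paths, uses a placeholder `MCHAN_BASE_ADDR`, and elides the actual transfer programming via the `mchan_transfer_*` helpers.

```c
#include <stdint.h>

// The header requires MCHAN_BASE_ADDR (and optionally MCHAN_EVENT) to be defined
// by the including file; this value is a placeholder, not a real peripheral address.
#define MCHAN_BASE_ADDR (0xDEADBEEF)
#include "mchan_v7.h"

void copy_tile_blocking(void) {
  // Allocate a transfer channel; IDs are bounded by MCHAN_CHANNEL_ID_MAX (15),
  // which the new assert()s in free/is_busy/wait now enforce.
  uint32_t channel_id = mchan_channel_alloc();

  // ... issue the EXT2LOC / LOC2EXT transfer here via the mchan_transfer_* helpers ...

  mchan_channel_wait(channel_id); // poll (or sleep on MCHAN_EVENT) until the channel is idle
  mchan_channel_free(channel_id); // release the channel; asserts the ID bound
}
```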