
Commit 7f99f2c

Adaptation for Merging Devel
1 parent 3271c3a commit 7f99f2c

File tree: 260 files changed (+347 / -13617 lines)


Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py

Lines changed: 10 additions & 0 deletions
@@ -28,4 +28,14 @@
 referenceTemplate = NodeTemplate("""
 // GELU (Name: ${nodeName}, Op: ${nodeOp})
 PULP_GELU_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size});
+""")
+
+referenceGradTemplate = NodeTemplate("""
+// GELU Parallel (Name: ${nodeName}, Op: ${nodeOp})
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_log2Core = log2(NUM_CORES);
+int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0);
+int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size});
+int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size});
+GELU_fp${data_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}_sigmoid_grad_chunk(${grad_in}, ${data_in}, ${grad_out}, ${nodeName}_chunk_start, ${nodeName}_chunk_stop);
 """)

Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py

Lines changed: 39 additions & 2 deletions
@@ -22,16 +22,38 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the Licens
-from Deeploy.DeeployTypes import NodeTemplate
+from Deeploy.DeeployTypes import NodeTemplate, NetworkContext, OperatorRepresentation
+from Deeploy.AbstractDataTypes import float32_tPtr
+from typing import Tuple, Dict, List

-referenceTemplate = NodeTemplate("""
+class PULPFloatGEMMTemplate(NodeTemplate):
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+
+        if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None:
+            # No bias case - set C to NULL and provide a default type
+            operatorRepresentation['C'] = None
+            operatorRepresentation['C_type'] = float32_tPtr  # Default to fp32 type
+
+        return ctxt, operatorRepresentation, []
+
+referenceTemplate = PULPFloatGEMMTemplate("""
 // GEMM (Name: ${nodeName}, Op: ${nodeOp})
 ${A_type.typeName} ref_${data_out}_${A} = ${A};
 ${B_type.typeName} ref_${data_out}_${B} = ${B};
+% if C is not None:
 ${C_type.typeName} ref_${data_out}_${C} = ${C};
+% else:
+${C_type.typeName} ref_${data_out}_C = NULL;
+% endif
 ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

 for(uint32_t i=0; i<${batch}; i++){
+% if C is not None:
     PULP_Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
         ref_${data_out}_${A},
         ref_${data_out}_${B},
@@ -43,10 +65,25 @@
         ${transA},
         ${transB}
     );
+% else:
+    PULP_Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
+        ref_${data_out}_${A},
+        ref_${data_out}_${B},
+        NULL,
+        ref_${data_out}_${data_out},
+        ${M},
+        ${N},
+        ${O},
+        ${transA},
+        ${transB}
+    );
+% endif

     ref_${data_out}_${A} += ${M} * ${N};
     ref_${data_out}_${B} += ${N} * ${O};
+% if C is not None:
     ref_${data_out}_${C} += ${M} * ${O};
+% endif
     ref_${data_out}_${data_out} += ${M} * ${O};
 }
 """)

Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py

Lines changed: 34 additions & 0 deletions
@@ -36,4 +36,38 @@
     ${size},
     ${lastDimLength}
 );
+""")
+
+referenceGradTemplate = NodeTemplate("""
+// FloatLayernormGrad Parallel (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_log2Core = log2(NUM_CORES);
+
+int32_t ${nodeName}_seq_length = ${size} / ${lastDimLength};
+int32_t ${nodeName}_chunk = (${nodeName}_seq_length >> ${nodeName}_log2Core) +
+                            ((${nodeName}_seq_length & (NUM_CORES-1)) != 0);
+int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${nodeName}_seq_length);
+int32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${nodeName}_seq_length);
+
+int32_t ${nodeName}_elem_start = ${nodeName}_start * ${lastDimLength};
+int32_t ${nodeName}_elem_end = ${nodeName}_end * ${lastDimLength};
+int32_t ${nodeName}_elem_count = ${nodeName}_elem_end - ${nodeName}_elem_start;
+
+const float* ${nodeName}_grad_in_ptr = ${grad_in} + ${nodeName}_elem_start;
+const float* ${nodeName}_data_in_ptr = ${data_in} + ${nodeName}_elem_start;
+float* ${nodeName}_grad_out_ptr = ${grad_out} + ${nodeName}_elem_start;
+
+if (${nodeName}_elem_count > 0) {
+    LayernormGrad_fp${grad_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}(
+        ${nodeName}_grad_in_ptr,   // Upstream gradient (dy)
+        ${nodeName}_data_in_ptr,   // Original input (x)
+        ${nodeName}_grad_out_ptr,  // Output gradient (dx)
+        ${weight},                 // Input Scale parameter
+        ${bias},                   // Input Bias parameter
+        ${epsilon},                // Epsilon for numerical stability
+        ${nodeName}_elem_count,    // Number of elements to process
+        ${lastDimLength}           // Size of the feature dimension
+    );
+}
 """)

Deeploy/Targets/Redmule/Templates/GEMMTemplate.py

Lines changed: 25 additions & 5 deletions
@@ -22,9 +22,26 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the Licens
-from Deeploy.DeeployTypes import NodeTemplate
+from Deeploy.DeeployTypes import NodeTemplate, NetworkContext, OperatorRepresentation
+from Deeploy.AbstractDataTypes import float32_tPtr
+from typing import Tuple, Dict, List

-referenceTemplate = NodeTemplate("""
+class RedMuleGEMMTemplate(NodeTemplate):
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+
+        if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None:
+            # No bias case - set C to NULL and provide a default type
+            operatorRepresentation['C'] = None
+            operatorRepresentation['C_type'] = float32_tPtr  # Default to fp32 type
+
+        return ctxt, operatorRepresentation, []
+
+referenceTemplate = RedMuleGEMMTemplate("""
 // GEMM using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})

 int8_t ${nodeName}_core_id = pi_core_id();
@@ -33,10 +50,13 @@
 for(uint32_t b=0; b<${batch}; b++) {
     ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
     ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+% if C is not None:
     ${C_type.typeName} batch_C = ${C} + b * ${M} * ${O};
+% endif
     ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};

-% if beta == 0:
+% if C is None or beta == 0:
+    // No bias or beta=0: use MatMul
     MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
         (const float32_t *) batch_A,
         (const float32_t *) batch_B,
@@ -46,6 +66,7 @@
         ${O}
     );
 % else:
+    // With bias and beta!=0: use Gemm
     Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
         (const float32_t *) batch_A,
         (const float32_t *) batch_B,
@@ -58,5 +79,4 @@
 % endif
 }
 }
-"""
-)
+""")

Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py

Lines changed: 40 additions & 17 deletions
@@ -1,4 +1,3 @@
-
 # ----------------------------------------------------------------------
 #
 # File: GEMMTileConstraint.py
@@ -8,8 +7,7 @@
 # Copyright (C) 2023, ETH Zurich and University of Bologna.
 #
 # Author:
-# - Victor Jung, [email protected], ETH Zurich
-# - Moritz Scherer, [email protected], ETH Zurich
+# - Run Wang, ETH Zurich
 #
 # ----------------------------------------------------------------------
 # SPDX-License-Identifier: Apache-2.0
@@ -46,16 +44,23 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
         # Get to-be-tiled tensor's buffers
         bufferA = ctxt.lookup(name = parseDict['A'])
         bufferB = ctxt.lookup(name = parseDict['B'])
-        bufferC = ctxt.lookup(name = parseDict['C'])
         outputBuffer = ctxt.lookup(name = parseDict['data_out'])

+        # Check if bias exists
+        has_bias = 'C' in parseDict and parseDict['C'] is not None
+        if has_bias:
+            bufferC = ctxt.lookup(name = parseDict['C'])
+
         # Add I/O dimensions to the model as variables
-        for bufferName in [bufferA.name, bufferB.name, bufferC.name, outputBuffer.name]:
+        tensor_names = [bufferA.name, bufferB.name, outputBuffer.name]
+        if has_bias:
+            tensor_names.append(bufferC.name)
+
+        for bufferName in tensor_names:
             tilerModel.addTensorDimToModel(ctxt, bufferName)

         dimOffsetA = len(bufferA.shape) - 2
         dimOffsetB = len(bufferB.shape) - 2
-        dimOffsetC = len(bufferC.shape) - 2
         dimOffsetOut = len(outputBuffer.shape) - 2

         AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA'])
@@ -74,10 +79,13 @@
         # Add GEMM Geometrical constraints
         tilerModel.addConstraint(ASecondDimVar == BFirstDimVar)

-        addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC)
-        addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1)
-        tilerModel.addConstraint(outputFirstDimVar == addDimVar_1)
-        tilerModel.addConstraint(outputSecondDimVar == addDimVar_2)
+        # Add bias constraints only if bias exists
+        if has_bias:
+            dimOffsetC = len(bufferC.shape) - 2
+            addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC)
+            addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1)
+            tilerModel.addConstraint(outputFirstDimVar == addDimVar_1)
+            tilerModel.addConstraint(outputSecondDimVar == addDimVar_2)

         return tilerModel

@@ -114,7 +122,15 @@ def serializeTilingSolution(
             operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
         outputCubes = [cube.rectangle for cube in absoluteOutputCubes]

-        addrNames = ['A', 'B', 'C', 'data_out']
+        # Check if bias exists
+        has_bias = 'C' in operatorRepresentation and operatorRepresentation['C'] is not None
+
+        # Adjust address names based on bias existence
+        if has_bias:
+            addrNames = ['A', 'B', 'C', 'data_out']
+        else:
+            addrNames = ['A', 'B', 'data_out']
+
         inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                   operatorRepresentation, addrNames)

@@ -169,11 +185,13 @@
             else:
                 BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize))

-            CCube = HyperRectangle(cube.offset, cube.dims)
-
             inputACubes.append(ACube)
             inputBCubes.append(BCube)
-            inputAddCubes.append(CCube)
+
+            # Only add bias cubes if bias exists
+            if has_bias:
+                CCube = HyperRectangle(cube.offset, cube.dims)
+                inputAddCubes.append(CCube)

         inputLoadSchedule = []
         outputLoadSchedule = []
@@ -187,12 +205,17 @@
             "batch": PointerClass(uint8_t)
         }

-        for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
-            inputLoadSchedule.append({"A": a, "B": b, "C": c})
+        # Create input load schedule based on bias existence
+        if has_bias:
+            for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
+                inputLoadSchedule.append({"A": a, "B": b, "C": c})
+        else:
+            for a, b in zip(inputACubes, inputBCubes):
+                inputLoadSchedule.append({"A": a, "B": b})

         for out in outputCubes:
             outputLoadSchedule.append({"data_out": out})

         schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)

-        return VariableReplacementScheme(replacements, replacementTypes), schedule
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
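
Making the bias optional has to be threaded through every stage of the tiler: the dimension variables and equality constraints, the addrNames list handed to extractBaseAddr, the per-tile bias cubes, and finally the load schedule. A toy sketch of how the schedule's shape changes (placeholder strings stand in for the real HyperRectangle tiles):

a_cubes, b_cubes, c_cubes = ["a0", "a1"], ["b0", "b1"], ["c0", "c1"]

# With bias: each tile loads an A, B and matching C cube.
with_bias = [{"A": a, "B": b, "C": c} for a, b, c in zip(a_cubes, b_cubes, c_cubes)]
# Without bias: the 'C' key is absent, so no transfer is ever scheduled for it.
without_bias = [{"A": a, "B": b} for a, b in zip(a_cubes, b_cubes)]

assert with_bias[1] == {"A": "a1", "B": "b1", "C": "c1"}
assert without_bias[1] == {"A": "a1", "B": "b1"}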

Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py

Lines changed: 3 additions & 3 deletions
@@ -42,11 +42,11 @@ class TilingMetaInfo:
 _CodeSegmentType = List[CodeSnippet]

 _measureCycles = NodeTemplate("""
-${nodeName}_${measurementName}_measurements[${tileIdx}] = getCycles();
+${nodeName}_${measurementName}_t[${tileIdx}] = getCycles();
 """)

 _measurementArrayDeclaration = NodeTemplate("""
-uint32_t ${nodeName}_${measurementName}_measurements[${numTiles}];
+static uint32_t ${nodeName}_${measurementName}_t[${numTiles}];
 """)

 _printPrefixAndSufixDeclaration = NodeTemplate("""
@@ -74,7 +74,7 @@ class TilingMetaInfo:
 """)
 _printCycleDifference = NodeTemplate(r"""
 printf("%s%u] %s%u%s", ${nodeName}_prefix,${tileIdx},"${flavorStr}", \
-${nodeName}_${endMeasurementName}_measurements[${tileIdx}] - ${nodeName}_${startMeasurementName}_measurements[${tileIdx}],${nodeName}_suffix);
+${nodeName}_${endMeasurementName}_t[${tileIdx}] - ${nodeName}_${startMeasurementName}_t[${tileIdx}],${nodeName}_suffix);
 """)

 _printLoopTeardown = NodeTemplate("""
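
This last change shortens the generated identifier suffix from _measurements to _t and declares the per-tile cycle arrays static, giving them static storage duration instead of a slot in the enclosing function's stack frame. A sketch of the C declaration the updated template renders (Mako substitution simulated with str.replace; the node and measurement names are made up):

decl = "static uint32_t ${nodeName}_${measurementName}_t[${numTiles}];"
rendered = (decl.replace("${nodeName}", "node0")
                .replace("${measurementName}", "kernel")
                .replace("${numTiles}", "4"))
assert rendered == "static uint32_t node0_kernel_t[4];"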
