diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc index 2a5b7fecc..80b0e6599 100644 --- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc @@ -565,14 +565,6 @@ void GptContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_l } } - if (int8_mode_ == 1) { - // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max - // possible memory that would be required by any of the individual gemms. - const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); - mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); - mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); - } - if (int8_mode_ == 1) { // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max // possible memory that would be required by any of the individual gemms.