diff --git a/CMakeLists.txt b/CMakeLists.txt index 02023951b1a..c3b2f2f5d3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,9 @@ message(STATUS "CMAKE GENERATOR: ${CMAKE_GENERATOR}") set(THIRD_PARTY_PATH "${PADDLE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") +# Simd configurations +include(simd) + if(WIN32) include(os/windows) endif() @@ -51,12 +54,14 @@ lite_option(WITH_AVX "Compile PaddlePaddle with AVX in lite_option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) lite_option(WITH_MKL "Compile PaddlePaddle with MKL support." ON IF ${AVX_FOUND}) lite_option(WITH_ARM_DOTPROD "Compile PaddlePaddle with ARM dot production" ON) +lite_option(WITH_LASX "Compile PaddlePaddle with LoongArch ASX intrinsics" ON IF ${LASX_FOUND}) lite_option(WITH_SYSTEM_BLAS "Use system blas library" OFF) # for lite, both server and mobile framework. lite_option(LITE_WITH_JAVA "Enable Java JNI lib in lite mode" OFF) lite_option(LITE_WITH_STATIC_LIB "Enable static cplus lib in lite mode" OFF) lite_option(LITE_WITH_PYTHON "Enable Python api lib in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) +lite_option(LITE_WITH_LOONGARCH "Enable LoongArch in lite mode" OFF) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_SW "Enable SW in lite mode" OFF) lite_option(LITE_WITH_NNADAPTER "Enable NNAdapter in lite mode" OFF) @@ -92,9 +97,6 @@ lite_option(LITE_WITH_ARM_DNN_LIBRARY "Use Arm DNN library instead of b find_package(Git REQUIRED) find_package(Threads REQUIRED) -# Simd configurations -include(simd) - # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) if(WIN32) diff --git a/cmake/backends/common.cmake b/cmake/backends/common.cmake index 44897d9e57b..db828faa0dd 100644 --- a/cmake/backends/common.cmake +++ b/cmake/backends/common.cmake @@ -20,7 +20,13 @@ if(LITE_WITH_XPU) include(backends/xpu) endif() -include(backends/x86) +if(LITE_WITH_X86) + include(backends/x86) +endif() + +if(LITE_WITH_LOONGARCH) + include(backends/loongarch) +endif() # Add dependencies include(generic) # simplify cmake module diff --git a/cmake/backends/loongarch.cmake b/cmake/backends/loongarch.cmake new file mode 100644 index 00000000000..559916eda36 --- /dev/null +++ b/cmake/backends/loongarch.cmake @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT LITE_WITH_LOONGARCH) + RETURN() +ENDIF() + +# We need C++17. +SET(CMAKE_CXX_STANDARD 17) +SET(CMAKE_CXX_STANDARD_REQUIRED True) +# But some warning should be omit. 
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=register") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=array-bounds") + +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlsx") +INCLUDE(external/xxhash) # download and install xxhash + +IF(LITE_WITH_OPENMP) + ADD_DEFINITIONS(-DWITH_OMP) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fopenmp") + SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fopenmp") +ENDIF() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e898064a566..44258325cb9 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -113,6 +113,10 @@ if (LITE_WITH_X86) add_definitions("-DLITE_WITH_X86") endif() +if (LITE_WITH_LOONGARCH) + add_definitions("-DLITE_WITH_LOONGARCH") +endif() + if (LITE_WITH_ARM) add_definitions("-DLITE_WITH_ARM") endif() diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 935e07ddca1..7f942b5ea84 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -138,7 +138,7 @@ ExternalProject_Add( ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR} ) -IF(NOT WIN32 AND NOT LITE_WITH_SW) +IF(NOT WIN32 AND NOT LITE_WITH_SW AND NOT LITE_WITH_LOONGARCH) add_compile_options(-m64) ENDIF() INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 12f6f0a3df3..15aff6a3dfc 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -33,7 +33,7 @@ IF(NOT ${CBLAS_FOUND}) IF (NOT WIN32) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") - SET(OPENBLAS_COMMIT "v0.2.20") + SET(OPENBLAS_COMMIT "v0.3.28") IF(APPLE) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") @@ -47,7 +47,7 @@ IF(NOT ${CBLAS_FOUND}) ExternalProject_Add( extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git GIT_TAG ${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 299bc359a00..cee10c1bffd 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -152,7 +152,7 @@ set(GPU_COMMON_FLAGS -Wno-error=array-bounds # Warnings in Eigen::array -gencode arch=compute_62,code=sm_62 ) -if(LITE_WITH_SW AND NOT EMSCRIPTEN) +if(LITE_WITH_SW AND NOT EMSCRIPTEN AND NOT LITE_WITH_LOONGARCH) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() endif(NOT WIN32) diff --git a/cmake/functions.cmake b/cmake/functions.cmake index 706480624a1..8fadd8c5ec7 100644 --- a/cmake/functions.cmake +++ b/cmake/functions.cmake @@ -182,6 +182,10 @@ macro(lite_option variable description value) list(APPEND ${__varname} ${arg}) endif() endforeach() + # If the condition is empty, it should evaluate to FALSE + if("${__varname}" STREQUAL "__condition" AND __condition STREQUAL "") + set(__condition 1 GREATER 2) + endif() unset(__varname) if(__condition STREQUAL "") set(__condition 2 GREATER 1) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 73a7d9814be..7930769b072 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -3,6 +3,7 @@ include(CheckCXXSourceRuns) include(CheckCXXSourceCompiles) +include(CheckCXXCompilerFlag) if(IOS) return () @@ -119,4 +120,12 @@ mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND
AVX512F_FO if(WITH_AVX AND AVX_FOUND) add_definitions(-DLITE_WITH_AVX) -endif() \ No newline at end of file +endif() + +if(LITE_WITH_LOONGARCH) + check_cxx_compiler_flag(-mlasx LASX_FOUND) + check_cxx_compiler_flag(-mlsx LSX_FOUND) + if(NOT LSX_FOUND) + message(FATAL_ERROR "At least LSX support!") + endif() +endif() diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 566f94565c3..43c5b73cb03 100755 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -2,6 +2,7 @@ include(lite) message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") +message(STATUS "LITE_WITH_LOONGARCH:\t${LITE_WITH_LOONGARCH}") message(STATUS "LITE_WITH_SW:\t${LITE_WITH_SW}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_METAL:\t${LITE_WITH_METAL}") @@ -405,6 +406,26 @@ if (LITE_WITH_X86) endif() endif() +if (LITE_WITH_LOONGARCH) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${PADDLE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${PADDLE_BINARY_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${PADDLE_BINARY_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${PADDLE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${PADDLE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${PADDLE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference test_model_bin) +endif() + if (LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 5dfa3dbe53f..f9ce6775a3e 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -435,6 +435,10 @@ void Predictor::Build(const std::shared_ptr &program_desc, inner_places.insert(inner_places.begin(), Place{TARGET(kX86), PRECISION(kInt8)}); } + if (valid_place.target == TARGET(kLoongArch)) { + inner_places.insert(inner_places.begin(), + Place{TARGET(kLoongArch), PRECISION(kInt8)}); + } } } // XPU target must make sure to insert in front of others. 
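The two hunks that follow (cxx_api_impl.cc and light_api_impl.cc) apply the LoongArch math thread count by clamping the value requested through ConfigBase against OpenMP's limit. A minimal standalone sketch of that pattern, assuming OpenMP is enabled (-fopenmp and -DWITH_OMP); the helper name ApplyMathNumThreads is illustrative only and not part of this patch:

#include <algorithm>

#ifdef WITH_OMP
#include <omp.h>
#endif

// Hedged sketch: mirrors the clamping done in CxxPaddleApiImpl::Init and
// LightPredictorImpl::Init below, combined with the ">= 1" guard used by
// ConfigBase::set_threads.
static int ApplyMathNumThreads(int requested) {
#ifdef WITH_OMP
  // Never ask OpenMP for more threads than it can provide, never fewer than 1.
  int max_threads = omp_get_max_threads();
  int real_threads = std::min(std::max(requested, 1), max_threads);
  omp_set_num_threads(real_threads);
  return real_threads;
#else
  (void)requested;  // single-threaded fallback when OpenMP is disabled
  return 1;
#endif
}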
diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index a8279d691f3..3af10275312 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -37,6 +37,11 @@ #endif #include "lite/backends/x86/mklml.h" #endif + +#ifdef WITH_OMP +#include +#endif + namespace paddle { namespace lite { @@ -149,6 +154,16 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { << real_num_threads; #endif +#if defined(WITH_OMP) && defined(LITE_WITH_LOONGARCH) + int num_threads = config.math_num_threads(); + int max_num_threads = omp_get_max_threads(); + int real_num_threads = std::min(num_threads, max_num_threads); + omp_set_num_threads(real_num_threads); + VLOG(3) << "math_num_threads() is set successfully and the " + "real number of threads is:" + << real_num_threads; +#endif + #ifdef LITE_WITH_XPU auto preferred_inputs = config.preferred_inputs_for_warmup(); for (auto &preferred_input : preferred_inputs) { diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 9921c5b3d43..d5ada5e529d 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -29,6 +29,10 @@ #include "lite/backends/x86/mklml.h" #endif +#ifdef WITH_OMP +#include +#endif + namespace paddle { namespace lite { @@ -110,6 +114,16 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) { "number of threads is:" << real_num_threads; #endif + +#if defined(WITH_OMP) && defined(LITE_WITH_LOONGARCH) + int num_threads = config.math_num_threads(); + int max_num_threads = omp_get_max_threads(); + int real_num_threads = std::min(num_threads, max_num_threads); + omp_set_num_threads(real_num_threads); + VLOG(3) << "math_num_threads() is set successfully and the " + "real number of threads is:" + << real_num_threads; +#endif } LightPredictorImpl::~LightPredictorImpl() { diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index fbe5ddbcfc3..32d1776c5db 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -325,6 +325,9 @@ void ConfigBase::set_threads(int threads) { mode_ = lite::DeviceInfo::Global().mode(); threads_ = lite::DeviceInfo::Global().threads(); #endif +#ifdef LITE_WITH_LOONGARCH + math_num_threads_ = threads >= 1 ? 
threads : 1; +#endif } void ConfigBase::set_metal_device(void *device) { @@ -381,6 +384,10 @@ void ConfigBase::set_x86_math_num_threads(int threads) { int ConfigBase::x86_math_num_threads() const { return x86_math_num_threads_; } #endif +#ifdef LITE_WITH_LOONGARCH +int ConfigBase::math_num_threads() const { return math_num_threads_; } +#endif + void ConfigBase::set_subgraph_model_cache_buffers( const std::string &key, const std::vector &cfg, diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 21868818177..2d20541b1a8 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -236,6 +236,7 @@ class LITE_API ConfigBase { std::map> nnadapter_model_cache_buffers_{}; int device_id_{0}; int x86_math_num_threads_ = 1; + int math_num_threads_ = 1; std::string metal_path_; bool metal_use_mps_{false}; @@ -418,6 +419,7 @@ class LITE_API ConfigBase { // set x86_math_num_threads void set_x86_math_num_threads(int threads); int x86_math_num_threads() const; + int math_num_threads() const; void set_metal_lib_path(const std::string& path); void set_metal_use_mps(bool flag); diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 4b9b03b6050..f03f70311ac 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -87,7 +87,8 @@ const std::string& TargetToStr(TargetType target) { "imagination_nna", "intel_fpga", "metal", - "nnadapter"}; + "nnadapter", + "loongarch"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); @@ -145,7 +146,8 @@ const std::string& TargetRepr(TargetType target) { "kImaginationNNA", "kIntelFPGA", "kMetal", - "kNNAdapter"}; + "kNNAdapter", + "kLoongArch"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -212,7 +214,8 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kImaginationNNA), TARGET(kIntelFPGA), TARGET(kMetal), - TARGET(kNNAdapter)}); + TARGET(kNNAdapter), + TARGET(kLoongArch)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index c5757b89b94..72b646fe1c8 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include // Generic helper definitions for shared library support #if defined _WIN32 || defined __CYGWIN__ @@ -62,7 +63,8 @@ enum class TargetType : int { kIntelFPGA = 16, kMetal = 17, kNNAdapter = 18, - NUM = 19, // number of fields. + kLoongArch = 19, + NUM = 20, // number of fields. 
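+  // NOTE: keep NUM as the last entry; TargetToStr()/TargetRepr() in
+  // lite/api/paddle_place.cc index their name tables by this enum value,
+  // so the new kLoongArch entry is also appended to those tables below.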
}; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/api/tools/benchmark/CMakeLists.txt b/lite/api/tools/benchmark/CMakeLists.txt index 0f199960006..82a4ab1add4 100644 --- a/lite/api/tools/benchmark/CMakeLists.txt +++ b/lite/api/tools/benchmark/CMakeLists.txt @@ -51,6 +51,10 @@ set(TARGET "benchmark_bin") lite_cc_binary(${TARGET} SRCS ${BENCHMARK_SRC} DEPS gflags CV_DEPS paddle_cv_arm) +IF("${CBLAS_PROVIDER}" STREQUAL "OPENBLAS") + TARGET_LINK_LIBRARIES(${TARGET} ${CBLAS_LIBRARIES}) +ENDIF() + if(ARM_TARGET_OS STREQUAL "android") # Validation Dataset diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index 27c1c45e8ec..b7edcb01648 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(opencl) add_subdirectory(arm) add_subdirectory(x86) +add_subdirectory(loongarch) add_subdirectory(host) add_subdirectory(xpu) add_subdirectory(metal) diff --git a/lite/backends/loongarch/CMakeLists.txt b/lite/backends/loongarch/CMakeLists.txt new file mode 100644 index 00000000000..a112da7bc73 --- /dev/null +++ b/lite/backends/loongarch/CMakeLists.txt @@ -0,0 +1,53 @@ +if (NOT LITE_WITH_LOONGARCH) + return() +elseif(LITE_ON_MODEL_OPTIMIZE_TOOL) + return() +endif () + +configure_file (warpctc_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/warpctc_lib_path.h) + +# source code and dependencies of loongarch_math static lib +set(LOONGARCH_MATH_SRC "" CACHE INTERNAL "") +set(LOONGARCH_MATH_DEPS framework_proto eigen3 CACHE INTERNAL "") + +# source code in current directory +FILE(GLOB LOONGARCH_BASE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) +# source code about jit +FILE(GLOB LOONGARCH_JIT_SRC ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cc) +FILE(GLOB LOONGARCH_JIT_REFER_SRC ${CMAKE_CURRENT_SOURCE_DIR}/jit/refer/*.cc) +if (NOT WIN32 AND NOT APPLE) +FILE(GLOB LOONGARCH_JIT_GEN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/jit/gen/*.cc) +endif() +set(LOONGARCH_JIT_MORE_SRC "" CACHE INTERNAL "") +add_subdirectory(jit) +# Fluid source file +FILE(GLOB LOONGARCH_FLUID_SRC ${CMAKE_CURRENT_SOURCE_DIR}/fluid/*.cc) +# detailed implementation of LoongArch math +FILE(GLOB LOONGARCH_DETAIL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/math/*.cc) +FILE(GLOB LOONGARCH_DETAIL_COMMON_SRC ${CMAKE_CURRENT_SOURCE_DIR}/math/common/*.cc) +FILE(GLOB LOONGARCH_DETAIL_LASX_SRC ${CMAKE_CURRENT_SOURCE_DIR}/math/lasx/*.cc) +FILE(GLOB LOONGARCH_DETAIL_LSX_SRC ${CMAKE_CURRENT_SOURCE_DIR}/math/lsx/*.cc) + +# Step 1. collect source files +list(APPEND LOONGARCH_MATH_SRC ${LOONGARCH_DETAIL_LSX_SRC}) +list(APPEND LOONGARCH_MATH_SRC ${LOONGARCH_BASE_SRC}) +list(APPEND LOONGARCH_MATH_SRC ${LOONGARCH_FLUID_SRC}) +list(APPEND LOONGARCH_MATH_SRC ${LOONGARCH_JIT_SRC}) +list(APPEND LOONGARCH_MATH_SRC ${LOONGARCH_JIT_REFER_SRC}) +list(APPEND LOONGARCH_MATH_SRC ${LOONGARCH_JIT_MORE_SRC}) +list(APPEND LOONGARCH_MATH_SRC ${LOONGARCH_DETAIL_SRC}) +list(APPEND LOONGARCH_MATH_SRC ${LOONGARCH_DETAIL_COMMON_SRC}) + +# Step 2. third party lib +# 2.1 lasx or lsx +IF(WITH_LASX) + set(LOONGARCH_MATH_SRC ${LOONGARCH_MATH_SRC} ${LOONGARCH_DETAIL_LASX_SRC}) +ENDIF() + +# Step 3.
Compile into a static lib libloongarch_math.a +lite_cc_library(loongarch_math SRCS ${LOONGARCH_MATH_SRC} DEPS ${LOONGARCH_MATH_DEPS}) +add_dependencies(loongarch_math eigen3) + +IF(WITH_LASX) + TARGET_COMPILE_OPTIONS(loongarch_math PRIVATE "-mlasx") +ENDIF() diff --git a/lite/backends/loongarch/cpu_info.cc b/lite/backends/loongarch/cpu_info.cc new file mode 100644 index 00000000000..5a13c4614fa --- /dev/null +++ b/lite/backends/loongarch/cpu_info.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/loongarch/cpu_info.h" + +#include + +#include // For getauxval +#ifndef HWCAP_LOONGARCH_LSX +#define HWCAP_LOONGARCH_LSX (1 << 4) +#endif +#ifndef HWCAP_LOONGARCH_LASX +#define HWCAP_LOONGARCH_LASX (1 << 5) +#endif + +#include +#include "lite/utils/log/cp_logging.h" + +#include "lite/utils/env.h" + +// DEFINE_double(fraction_of_cpu_memory_to_use, +// 1, +// "Default use 100% of CPU memory for PaddlePaddle," +// "reserve the rest for page tables, etc"); +double fraction_of_cpu_memory_to_use = + paddle::lite::GetDoubleFromEnv("fraction_of_cpu_memory_to_use", 1); + +// DEFINE_uint64(initial_cpu_memory_in_mb, +// 500ul, +// "Initial CPU memory for PaddlePaddle, in MB units."); +uint64_t initial_cpu_memory_in_mb = + paddle::lite::GetUInt64FromEnv("initial_cpu_memory_in_mb", 500ul); + +// If use_pinned_memory is true, CPUAllocator calls mlock, which +// returns pinned and locked memory as staging areas for data exchange +// between host and device. Allocating too much would reduce the amount +// of memory available to the system for paging. So, by default, +// use_pinned_memory should be set to false. +// DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); bool use_pinned_memory = + paddle::lite::GetBoolFromEnv("use_pinned_memory", true); + +namespace paddle { +namespace lite { +namespace loongarch { + +size_t CpuTotalPhysicalMemory() { + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +} + +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t CpuMinChunkSize() { + // The minimum chunk size allowed to be allocated is 4 KB. + return 1 << 12; +} + +size_t CpuMaxChunkSize() { + // The maximum chunk size allowed to be allocated is roughly 3% of CPU memory, + // or initial_cpu_memory_in_mb, whichever is smaller. + return std::min(static_cast(CpuMaxAllocSize() / 32), + static_cast(initial_cpu_memory_in_mb * 1 << 20)); +} + +bool MayIUse(const cpu_isa_t cpu_isa) { + static long loong_hwcap = 0UL; // HWCAP is normally non-zero.
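+  // Lazily query the kernel's CPU feature bits once: getauxval(AT_HWCAP)
+  // reports whether the 128-bit LSX and 256-bit LASX vector extensions are
+  // usable, via the HWCAP_LOONGARCH_LSX / HWCAP_LOONGARCH_LASX masks above.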
+ if (loong_hwcap == 0UL) + loong_hwcap = getauxval(AT_HWCAP); + switch (cpu_isa) { + case lsx: + return (loong_hwcap & HWCAP_LOONGARCH_LSX) != 0; + case lasx: + return (loong_hwcap & HWCAP_LOONGARCH_LASX) != 0; + case isa_any: + return true; + default: + return false; + } +} + +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/cpu_info.h b/lite/backends/loongarch/cpu_info.h new file mode 100644 index 00000000000..772a0a7bd56 --- /dev/null +++ b/lite/backends/loongarch/cpu_info.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "lite/backends/loongarch/xxl.h" + +#define ALIGN32_BEG +#define ALIGN32_END __attribute__((aligned(32))) + +namespace paddle { +namespace lite { +namespace loongarch { + +size_t CpuTotalPhysicalMemory(); + +//! Get the maximum allocation size for a machine. +size_t CpuMaxAllocSize(); + +//! Get the minimum chunk size for buddy allocator. +size_t CpuMinChunkSize(); + +//! Get the maximum chunk size for buddy allocator. +size_t CpuMaxChunkSize(); + +typedef enum { + isa_any, + lsx, + lasx, +} cpu_isa_t; // Instruction set architecture + +// May I use some instruction +bool MayIUse(const cpu_isa_t cpu_isa); + +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/dynamic_loader.cc b/lite/backends/loongarch/dynamic_loader.cc new file mode 100644 index 00000000000..c21959856e8 --- /dev/null +++ b/lite/backends/loongarch/dynamic_loader.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "lite/backends/loongarch/dynamic_loader.h" + +#include +#include // NOLINT +#include + +#include "lite/backends/loongarch/port.h" +#include "lite/backends/loongarch/warpctc_lib_path.h" +#include "lite/utils/env.h" +#include "lite/utils/log/cp_logging.h" + +// DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +std::string f_warpctc_dir = // NOLINT + paddle::lite::GetStringFromEnv("warpctc_dir"); // NOLINT + +// DEFINE_string( +// tensorrt_dir, +// "", +// "Specify path for loading tensorrt library, such as libnvinfer.so."); +std::string tensorrt_dir = // NOLINT + paddle::lite::GetStringFromEnv("tensorrt_dir"); // NOLINT + +namespace paddle { +namespace lite { +namespace loongarch { +static constexpr char warpctc_lib_path[] = WARPCTC_LIB_PATH; + +static inline std::string join(const std::string& part1, + const std::string& part2) { + // directory separator + const char sep = '/'; + if (!part2.empty() && part2.front() == sep) { + return part2; + } + std::string ret; + ret.reserve(part1.size() + part2.size() + 1); + ret = part1; + if (!ret.empty() && ret.back() != sep) { + ret += sep; + } + ret += part2; + return ret; +} + +static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, + int dynload_flags) { + VLOG(3) << "Try to find library: " << dso_path + << " from default system path."; + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + // and /usr/local/lib path + void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); + +// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to +// bring System Integrity Projection (SIP), if dso_handle +// is null, search from default package path in Mac OS. +#if defined(__APPLE__) || defined(__OSX__) + if (nullptr == dso_handle) { + dso_handle = + dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags); + if (nullptr == dso_handle) { + if (dso_path == "libcudnn.dylib") { + LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"; + } + } + } +#endif + + if (nullptr == dso_handle) { + LOG(WARNING) << "Can not find library: " << dso_path + << ". The process maybe hang. 
Please try to add the lib path " + "to LD_LIBRARY_PATH."; + } + return dso_handle; +} + +static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, + const std::string& dso_name, + bool throw_on_error = true) { +#if !defined(_WIN32) + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 + void* dso_handle = nullptr; + + std::string dlPath = dso_name; + if (search_root.empty()) { + dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags); + } else { + // search xxx.so from custom path + dlPath = join(search_root, dso_name); + dso_handle = dlopen(dlPath.c_str(), dynload_flags); +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 + // if not found, search from default path + if (nullptr == dso_handle) { + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << errorno << ")"; + if (dlPath.find("nccl") != std::string::npos) { + LOG(INFO) + << "You may need to install 'nccl2' from NVIDIA official website: " + << "https://developer.nvidia.com/nccl/nccl-download" + << "before install PaddlePaddle"; + } + dlPath = dso_name; + dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags); + } + } +/* +auto error_msg = + "Failed to find dynamic library: %s ( %s ) \n Please specify " + "its path correctly using following ways: \n Method. set " + "environment variable LD_LIBRARY_PATH on Linux or " + "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " + "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " + "using the DYLD_LIBRARY_PATH is impossible unless System " + "Integrity Protection (SIP) is disabled."; +*/ +#if !defined(_WIN32) +// auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 + if (throw_on_error) { + CHECK(dso_handle != nullptr); + // CHECK(nullptr != dso_handle, error_msg, dlPath, errorno); + } else if (nullptr == dso_handle) { + // LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); + } + + return dso_handle; +} + +void* GetWarpCTCDsoHandle() { + std::string warpctc_dir = warpctc_lib_path; + if (!f_warpctc_dir.empty()) { + warpctc_dir = f_warpctc_dir; + } +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll"); +#else + return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so"); +#endif +} + +void* GetTensorRtDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.dylib"); +#else + return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.so"); +#endif +} + +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/dynamic_loader.h b/lite/backends/loongarch/dynamic_loader.h new file mode 100644 index 00000000000..bfec142f21b --- /dev/null +++ b/lite/backends/loongarch/dynamic_loader.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace lite { +namespace loongarch { + +#ifndef _WIN32 +#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) +#else +#define DECLARE_TYPE(__name, ...) decltype(auto) +#endif + +void* GetWarpCTCDsoHandle(); +void* GetTensorRtDsoHandle(); + +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/data_type.cc b/lite/backends/loongarch/fluid/data_type.cc new file mode 100644 index 00000000000..6a453c2bdc2 --- /dev/null +++ b/lite/backends/loongarch/fluid/data_type.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "lite/backends/loongarch/fluid/data_type.h" +#include +#include +#include + +using float16 = paddle::lite::fluid::float16; + +namespace paddle { +namespace lite { +namespace fluid { + +struct DataTypeMap { + std::map cpp_to_proto_; + std::map proto_to_cpp_; + std::map proto_to_str_; + std::map proto_to_size_; +}; + +static DataTypeMap* InitDataTypeMap(); +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. 
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex +static DataTypeMap& gDataTypeMap() { + static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); + return *g_data_type_map_; +} + +template +static inline void RegisterType(DataTypeMap* map, + framework::proto::VarType::Type proto_type, + const std::string& name) { + map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); + map->cpp_to_proto_.emplace(typeid(T), proto_type); + map->proto_to_str_.emplace(static_cast(proto_type), name); + map->proto_to_size_.emplace(static_cast(proto_type), sizeof(T)); +} + +static DataTypeMap* InitDataTypeMap() { + auto retv = new DataTypeMap(); + +#define RegType(cc_type, proto_type) \ + RegisterType(retv, proto_type, #cc_type) + + _ForEachDataType_(RegType); + +#undef RegType + return retv; +} + +framework::proto::VarType::Type ToDataType(std::type_index type) { + auto it = gDataTypeMap().cpp_to_proto_.find(type); + if (it != gDataTypeMap().cpp_to_proto_.end()) { + return it->second; + } + LOG(FATAL) << "Not support " << type.name() << " as tensor type"; + return static_cast(-1); +} + +std::type_index ToTypeIndex(framework::proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_cpp_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_cpp_.end()) { + return it->second; + } + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; + return std::type_index(typeid(void)); +} + +std::string DataTypeToString(const framework::proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_str_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_str_.end()) { + return it->second; + } + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; + return std::string(); +} + +size_t SizeOfType(framework::proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_size_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_size_.end()) { + return it->second; + } + LOG(FATAL) << "Not support " << DataTypeToString(type).c_str() + << " as tensor type"; + return 0; +} + +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/data_type.h b/lite/backends/loongarch/fluid/data_type.h new file mode 100644 index 00000000000..6887bfcd6f7 --- /dev/null +++ b/lite/backends/loongarch/fluid/data_type.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "lite/backends/loongarch/fluid/float16.h" +#include "lite/core/framework.pb.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace fluid { + +template +struct DataTypeTrait {}; + +// Stub handle for void +template <> +struct DataTypeTrait { + constexpr static auto DataType = framework::proto::VarType::RAW; +}; + +#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ + callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); + +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::lite::fluid::float16, FP16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8) + +#define DefineDataTypeTrait(cpp_type, proto_type) \ + template <> \ + struct DataTypeTrait { \ + constexpr static auto DataType = proto_type; \ + } + +_ForEachDataType_(DefineDataTypeTrait); + +#undef DefineDataTypeTrait + +extern framework::proto::VarType::Type ToDataType(std::type_index type); +extern std::type_index ToTypeIndex(framework::proto::VarType::Type type); + +template +inline void VisitDataType(framework::proto::VarType::Type type, + Visitor visitor) { +#define VisitDataTypeCallback(cpp_type, proto_type) \ + do { \ + if (type == proto_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(VisitDataTypeCallback); +#undef VisitDataTypeCallback + LOG(FATAL) << "Not supported " << type; +} + +extern std::string DataTypeToString(const framework::proto::VarType::Type type); +extern size_t SizeOfType(framework::proto::VarType::Type type); +inline std::ostream& operator<<(std::ostream& out, + const framework::proto::VarType::Type& type) { + out << DataTypeToString(type); + return out; +} + +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/eigen.h b/lite/backends/loongarch/fluid/eigen.h new file mode 100644 index 00000000000..ffa21559a20 --- /dev/null +++ b/lite/backends/loongarch/fluid/eigen.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "lite/backends/loongarch/fluid/float16.h" +#include "lite/core/tensor.h" +#include "lite/utils/log/cp_logging.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace lite { +namespace fluid { + +// EigenDim converts paddle::platform::DDim into Eigen::DSizes. 
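+// The template parameter D is the compile-time rank: From() checks that
+// dims.size() equals D, so callers choose D to match the tensor's rank.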
+template +struct EigenDim { + using Type = Eigen::DSizes; + + static Type From(const lite::DDim& dims) { + CHECK_EQ(dims.size(), D) << "D must match DDim::size"; + Type ret; + for (size_t d = 0; d < dims.size(); d++) { + ret[d] = dims[d]; + } + return ret; + } + + static Type From(const DDim::value_type length) { + CHECK_EQ(D, 1) << "D must be 1."; + Type ret; + ret[0] = length; + return ret; + } +}; + +// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +template +struct EigenTensor { + // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on + // the speed of aligned and unaligned version in future. + using Type = Eigen::TensorMap>; + + using ConstType = + Eigen::TensorMap>; + + static Type From(Tensor& tensor, const lite::DDim& dims) { // NOLINT + return Type(const_cast(tensor.data()), + EigenDim::From(dims)); // NOLINT + } + + static Type From(Tensor& tensor) { // NOLINT + return From(tensor, tensor.dims()); + } // NOLINT + + static ConstType From(const Tensor& tensor, const lite::DDim& dims) { + return ConstType(tensor.data(), EigenDim::From(dims)); + } + + static ConstType From(const Tensor& tensor) { + return From(tensor, tensor.dims()); + } +}; + +template +struct EigenMatrix : public EigenTensor { + static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT + int num_col_dims) { + int rank = tensor.dims().size(); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; + return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); + } + + static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, + int num_col_dims) { + int rank = tensor.dims().size(); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; + return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); + } +}; + +template +struct EigenVector : public EigenTensor { + // Flatten reshapes a Tensor into an EigenVector. + static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT + return typename EigenVector::Type( + const_cast(tensor.data()), + EigenDim<1>::From(tensor.dims().production())); + } + + static typename EigenVector::ConstType Flatten( + const Tensor& tensor) { // NOLINT + return typename EigenVector::ConstType( + tensor.data(), EigenDim<1>::From(tensor.dims().production())); + } +}; + +template +struct EigenScalar { + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + using Type = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + using ConstType = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + + static Type From(Tensor* tensor) { + return Type(const_cast(tensor->data())); + } // NOLINT + + static ConstType From(const Tensor& tensor) { + return ConstType(tensor.data()); + } +}; + +template +struct EigenDevice; + +template <> +struct EigenDevice { + using Type = ::Eigen::DefaultDevice; +}; + +template +using EigenDeviceType = typename EigenDevice::Type; + +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/float16.h b/lite/backends/loongarch/fluid/float16.h new file mode 100644 index 00000000000..86bf0c24c36 --- /dev/null +++ b/lite/backends/loongarch/fluid/float16.h @@ -0,0 +1,802 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#ifdef __GNUC__ +#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) +#else +#define PADDLE_GNUC_VER 0 +#endif // __GNUC__ + +#ifdef __clang__ +#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__) +#else +#define PADDLE_CLANG_VER 0 +#endif // __clang__ + +#if defined(__arm__) || defined(__aarch64__) +#define PADDLE_ARM +#endif + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#define PADDLE_NEON +#include +#endif + +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37) +#define PADDLE_WITH_NATIVE_FP16 +#endif + +#ifndef PADDLE_ARM +#include "lite/backends/loongarch/xxl.h" +#endif // PADDLE_ARM + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +namespace paddle { +namespace lite { +namespace fluid { + +// Forward declare float16 for eigen.h +struct float16; + +} // namespace fluid +} // namespace lite +} // namespace paddle + +#include "lite/utils/macros.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace lite { +namespace fluid { + +// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated +// and aligned at least on a 2-byte boundary, which leads to efficient +// memory access of float16 struct and also makes float16 compatible +// with ARM float16_t, and Eigen::half data types. +struct PADDLE_ALIGN(2) float16 { + public: + uint16_t x; + + // The following defaulted special class member functions + // are added to make float16 pass the std::is_trivial test + float16() = default; + float16(const float16& o) = default; + float16& operator=(const float16& o) = default; + float16(float16&& o) = default; + float16& operator=(float16&& o) = default; + ~float16() = default; + + HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {} + +#ifdef PADDLE_WITH_NATIVE_FP16 + // __fp16 is a native half precision data type for arm cpu, + // float16_t is an alias for __fp16 + HOSTDEVICE inline explicit float16(const float16_t& h) { + x = *reinterpret_cast(&h); + } +#endif + + HOSTDEVICE inline explicit float16(float val) { +#if defined(PADDLE_WITH_NATIVE_FP16) + float32x4_t tmp = vld1q_dup_f32(&val); + float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); + x = *reinterpret_cast(&res); + +#elif defined(__F16C__) + x = _cvtss_sh(val, 0); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = val; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + x = v.ui | sign; + +#endif + } + + HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} + + template + HOSTDEVICE inline explicit float16(const T& val) + : x(float16(static_cast(val)).x) {} + + HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { + x = rhs.x; + return *this; + } + +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); + return *this; + } +#endif + + HOSTDEVICE inline float16& operator=(bool b) { + x = b ? 0x3c00 : 0; + return *this; + } + + HOSTDEVICE inline float16& operator=(int8_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint8_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int16_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint16_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int32_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint32_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int64_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint64_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(float val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(double val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline explicit operator Eigen::half() const { + Eigen::half h; + h.x = x; + return h; + } + +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline explicit operator float16_t() const { + return *reinterpret_cast(this); + } +#endif + + HOSTDEVICE inline explicit operator float() const { +#if defined(PADDLE_WITH_NATIVE_FP16) + float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); + return vgetq_lane_f32(vcvt_f32_f16(res), 0); + +#elif defined(__F16C__) + return _cvtsh_ss(this->x); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = this->x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif + } + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(static_cast(*this)); + } + + private: + union Bits { + float f; + int32_t si; + uint32_t ui; + }; + + static 
const int shift = 13; + static const int shiftSign = 16; + + static const int32_t infN = 0x7F800000; + static const int32_t maxN = 0x477FE000; // max flt16 as flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as flt32 + static const int32_t sigN = 0x80000000; // sign bit + + static constexpr int32_t infC = infN >> shift; + static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr int32_t maxC = maxN >> shift; + static constexpr int32_t minC = minN >> shift; + static constexpr int32_t sigC = sigN >> shiftSign; + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted + static const int32_t norC = 0x00400; // min flt32 normal downshifted + + static constexpr int32_t maxD = infC - maxC - 1; + static constexpr int32_t minD = minC - subC - 1; +}; + +// Arithmetic operators for float16 on GPU +#if defined(PADDLE_WITH_NATIVE_FP16) +inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +inline float16& operator+=(float16& a, const float16& b) { // NOLINT + a = a + b; + return a; +} + +inline float16& operator-=(float16& a, const float16& b) { // NOLINT + a = a - b; + return a; +} + +inline float16& operator*=(float16& a, const float16& b) { // NOLINT + a = a * b; + return a; +} + +inline float16& operator/=(float16& a, const float16& b) { // NOLINT + a = a / b; + return a; +} + +inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] 
"r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } + +inline bool operator<(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator<=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator>(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator>=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +// Arithmetic operators for float16, software emulated on other CPU +#else +inline float16 operator+(const float16& a, const float16& b) { + return float16(static_cast(a) + static_cast(b)); +} + +inline float16 operator-(const float16& a, const float16& b) { + return float16(static_cast(a) - static_cast(b)); +} + +inline float16 operator*(const float16& a, const float16& b) { + return float16(static_cast(a) * static_cast(b)); +} + +inline float16 operator/(const float16& a, const float16& b) { + return float16(static_cast(a) / static_cast(b)); +} + +inline float16 operator-(const float16& a) { + float16 res; + res.x = a.x ^ 0x8000; + return res; +} + +inline float16& operator+=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) + static_cast(b)); + return a; +} + +inline float16& operator-=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) - static_cast(b)); + return a; +} + +inline float16& operator*=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) * static_cast(b)); + return a; +} + +inline float16& operator/=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) / static_cast(b)); + return a; +} + +inline bool operator==(const float16& a, const float16& b) { + return static_cast(a) == static_cast(b); +} + +inline bool operator!=(const float16& a, const float16& b) { + return static_cast(a) != static_cast(b); +} + +inline bool operator<(const float16& a, const float16& b) { + return static_cast(a) < static_cast(b); +} + +inline bool operator<=(const float16& a, const float16& b) { + return static_cast(a) <= static_cast(b); +} + +inline bool operator>(const float16& a, const float16& b) { + return static_cast(a) > static_cast(b); +} + +inline bool operator>=(const float16& a, const float16& b) { + return static_cast(a) >= 
static_cast(b); +} +#endif + +HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { + float16 res; + res.x = a; + return res; +} + +HOSTDEVICE inline bool(isnan)(const float16& a) { + return (a.x & 0x7fff) > 0x7c00; +} + +HOSTDEVICE inline bool(isinf)(const float16& a) { + return (a.x & 0x7fff) == 0x7c00; +} + +HOSTDEVICE inline bool(isfinite)(const float16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +inline std::ostream& operator<<(std::ostream& os, const float16& a) { + os << static_cast(a); + return os; +} + +} // namespace fluid +} // namespace lite +} // namespace paddle + +namespace std { + +// Override the std::is_pod::value for float16 +// The reason is that different compilers implemented std::is_pod based on +// different C++ standards. float16 class is a plain old data in C++11 given +// that it is both trivial and standard_layout. +// However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is +// more restricted in that you cannot provide any customized +// constructor in float16. Hence, we override is_pod here following C++11 +// so that .cu files can be successfully compiled by nvcc. +template <> +struct is_pod { + static const bool value = + is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, + std::is_same::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::lite::fluid::float16& a) { + return paddle::lite::fluid::isnan(a); +} + +inline bool isinf(const paddle::lite::fluid::float16& a) { + return paddle::lite::fluid::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; + static const int max_digits10 = 5; + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static paddle::lite::fluid::float16(min)() { + return paddle::lite::fluid::raw_uint16_to_float16(0x400); + } + static paddle::lite::fluid::float16 lowest() { + return paddle::lite::fluid::raw_uint16_to_float16(0xfbff); + } + static paddle::lite::fluid::float16(max)() { + return paddle::lite::fluid::raw_uint16_to_float16(0x7bff); + } + static paddle::lite::fluid::float16 epsilon() { + return paddle::lite::fluid::raw_uint16_to_float16(0x0800); + } + static paddle::lite::fluid::float16 round_error() { + return paddle::lite::fluid::float16(0.5); + } + static paddle::lite::fluid::float16 infinity() { + return paddle::lite::fluid::raw_uint16_to_float16(0x7c00); + } + static paddle::lite::fluid::float16 quiet_NaN() { + return paddle::lite::fluid::raw_uint16_to_float16(0x7e00); + } + static paddle::lite::fluid::float16 
signaling_NaN() { + return paddle::lite::fluid::raw_uint16_to_float16(0x7e00); + } + static paddle::lite::fluid::float16 denorm_min() { + return paddle::lite::fluid::raw_uint16_to_float16(0x1); + } +}; + +} // namespace std + +namespace Eigen { + +using float16 = paddle::lite::fluid::float16; + +template <> +struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + HOSTDEVICE static inline float16 epsilon() { + return paddle::lite::fluid::raw_uint16_to_float16(0x0800); + } + HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); } + HOSTDEVICE static inline float16 highest() { + return paddle::lite::fluid::raw_uint16_to_float16(0x7bff); + } + HOSTDEVICE static inline float16 lowest() { + return paddle::lite::fluid::raw_uint16_to_float16(0xfbff); + } + HOSTDEVICE static inline float16 infinity() { + return paddle::lite::fluid::raw_uint16_to_float16(0x7c00); + } + HOSTDEVICE static inline float16 quiet_NaN() { + return paddle::lite::fluid::raw_uint16_to_float16(0x7c01); + } +}; + +namespace numext { + +template <> +HOSTDEVICE inline bool(isnan)(const float16& a) { + return (paddle::lite::fluid::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const float16& a) { + return (paddle::lite::fluid::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const float16& a) { + return (paddle::lite::fluid::isfinite)(a); +} + +template <> +HOSTDEVICE inline float16 exp(const float16& a) { + return float16(::expf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 erf(const float16& a) { + return float16(::erff(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 log(const float16& a) { + return float16(::logf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 tanh(const float16& a) { + return float16(::tanhf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 sqrt(const float16& a) { + return float16(::sqrtf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 ceil(const float16& a) { + return float16(::ceilf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 floor(const float16& a) { + return float16(::floorf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 round(const float16& a) { + return float16(::roundf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 pow(const float16& a, const float16& b) { + return float16(::powf(static_cast(a), static_cast(b))); +} + +template <> +HOSTDEVICE inline float16 abs(const float16& a) { + return float16(::fabs(static_cast(a))); +} + +} // namespace numext + +} // namespace Eigen diff --git a/lite/backends/loongarch/fluid/for_range.h b/lite/backends/loongarch/fluid/for_range.h new file mode 100644 index 00000000000..32337305641 --- /dev/null +++ b/lite/backends/loongarch/fluid/for_range.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "lite/core/context.h" + +namespace paddle { +namespace lite { +namespace fluid { + +template +struct ForRange { + ForRange(const lite::Context& dev_ctx, size_t limit) + : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/hostdevice.h b/lite/backends/loongarch/fluid/hostdevice.h new file mode 100644 index 00000000000..c297d19e93a --- /dev/null +++ b/lite/backends/loongarch/fluid/hostdevice.h @@ -0,0 +1,18 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#define HOSTDEVICE +#define DEVICE +#define HOST diff --git a/lite/backends/loongarch/fluid/lod.h b/lite/backends/loongarch/fluid/lod.h new file mode 100644 index 00000000000..56052d15c51 --- /dev/null +++ b/lite/backends/loongarch/fluid/lod.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace lite { +namespace fluid { +using LoD = std::vector>; + +static LoD ToAbsOffset(const LoD &in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; + } + } + return result; +} +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/math.h b/lite/backends/loongarch/fluid/math.h new file mode 100644 index 00000000000..8cc24200d37 --- /dev/null +++ b/lite/backends/loongarch/fluid/math.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include "math.h" // NOLINT + +namespace paddle { +namespace operators { + +inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) { + return static_cast(::expf(static_cast(x))); +} + +inline HOSTDEVICE float real_exp(float x) { return ::expf(x); } + +inline HOSTDEVICE double real_exp(double x) { return ::exp(x); } + +inline HOSTDEVICE platform::float16 real_log(platform::float16 x) { + return static_cast(::logf(static_cast(x))); +} + +inline HOSTDEVICE float real_log(float x) { return ::logf(x); } + +inline HOSTDEVICE double real_log(double x) { return ::log(x); } + +} // namespace operators +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/rw_lock.h b/lite/backends/loongarch/fluid/rw_lock.h new file mode 100644 index 00000000000..cf1442add34 --- /dev/null +++ b/lite/backends/loongarch/fluid/rw_lock.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if !defined(_WIN32) +#include +#else +#include // NOLINT +#endif // !_WIN32 + +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace fluid { + +#if !defined(_WIN32) +struct RWLock { + RWLock() { pthread_rwlock_init(&lock_, nullptr); } + + ~RWLock() { pthread_rwlock_destroy(&lock_); } + + inline void RDLock() { + CHECK_EQ(pthread_rwlock_rdlock(&lock_), 0) << "acquire read lock failed"; + } + + inline void WRLock() { + CHECK_EQ(pthread_rwlock_wrlock(&lock_), 0) << "acquire write lock failed"; + } + + inline void UNLock() { + CHECK_EQ(pthread_rwlock_unlock(&lock_), 0) << "unlock failed"; + } + + private: + pthread_rwlock_t lock_; +}; +// TODO(paddle-dev): Support RWLock for WIN32 for correctness. +#else +// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive +// In windows, rw_lock seems like a hack. Use empty object and do nothing. 
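+// A minimal usage sketch of the guards defined further below (illustrative
+// only; `table` is a hypothetical shared structure owned by the caller):
+//
+//   RWLock lock;
+//   {
+//     AutoRDLock rd(&lock);   // shared access, many readers at once on POSIX
+//     ReadFrom(table);
+//   }
+//   {
+//     AutoWRLock wr(&lock);   // exclusive access while mutating
+//     WriteTo(table);
+//   }
+//
+// With the std::mutex fallback below, RDLock and WRLock acquire the same
+// mutex, so readers are serialized as well: correctness is preserved, but
+// reader concurrency is lost.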
+struct RWLock { + // FIXME(minqiyang): use mutex here to do fake lock + inline void RDLock() { mutex_.lock(); } + + inline void WRLock() { mutex_.lock(); } + + inline void UNLock() { mutex_.unlock(); } + + private: + std::mutex mutex_; +}; +#endif + +class AutoWRLock { + public: + explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + ~AutoWRLock() { UnLock(); } + + private: + inline void Lock() { lock_->WRLock(); } + + inline void UnLock() { lock_->UNLock(); } + + private: + RWLock* lock_; +}; + +class AutoRDLock { + public: + explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + ~AutoRDLock() { UnLock(); } + + private: + inline void Lock() { lock_->RDLock(); } + + inline void UnLock() { lock_->UNLock(); } + + private: + RWLock* lock_; +}; + +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/selected_rows.cc b/lite/backends/loongarch/fluid/selected_rows.cc new file mode 100644 index 00000000000..7ab0515755c --- /dev/null +++ b/lite/backends/loongarch/fluid/selected_rows.cc @@ -0,0 +1,243 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/fluid/selected_rows.h" +namespace paddle { +namespace lite { +namespace fluid { + +struct ReAllocateVisitor { + ReAllocateVisitor(const lite::DDim& dims, lite::Tensor* tensor) + : dims_(dims), tensor_(tensor) {} + + template + void operator()() const { + lite::Tensor cpu_tensor; + T* ptr = cpu_tensor.mutable_data(lite::TargetType::kLoongArch, dims_); + const T* old_ptr = + tensor_->memory_size() == 0 ? 
nullptr : tensor_->mutable_data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + tensor_->numel(), ptr); + } + tensor_->ShareDataWith(cpu_tensor); + } + + lite::DDim dims_; + lite::Tensor* tensor_; +}; + +struct TensorCopyVisitor { + TensorCopyVisitor(lite::Tensor* dst, + int64_t dst_offset, + const lite::Tensor src, + int64_t src_offset, + int64_t size) + : dst_(dst), + dst_offset_(dst_offset), + src_(src), + src_offset_(src_offset), + size_(size) {} + + template + void apply() const { + // TODO(Yancey1989): support other place + std::copy_n(src_.data() + src_offset_, + size_, + dst_->mutable_data(lite::TargetType::kLoongArch) + dst_offset_); + } + + lite::Tensor* dst_; + int64_t dst_offset_; + lite::Tensor src_; + int64_t src_offset_; + int64_t size_; +}; + +struct TensorFillVisitor { + TensorFillVisitor(lite::Tensor* dst, + int64_t dst_offset, + int64_t size, + float value) + : dst_(dst), dst_offset_(dst_offset), size_(size) {} + + template + void apply() const { + // TODO(qiao): support other place + // paddle::platform::CPUPlace cpu; + auto* tensor_data = dst_->mutable_data(lite::TargetType::kLoongArch); + auto* start = tensor_data + dst_offset_; + auto* end = start + size_; + std::fill(start, end, static_cast(0.0)); + } + + lite::Tensor* dst_; + int64_t dst_offset_; + int64_t size_; +}; + +void SerializeToStream(std::ostream& os, + const SelectedRows& selected_rows, + const lite::Context& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2st field, rows information + auto& rows = selected_rows.rows(); + uint64_t size = rows.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + for (uint64_t i = 0; i < size; ++i) { + os.write(reinterpret_cast(&rows[i]), sizeof(rows[i])); + } + } + { + // the 3st field, the height of SelectedRows + int64_t height = selected_rows.height(); + os.write(reinterpret_cast(&height), sizeof(height)); + } + // the 4st field, Tensor data + TensorToStream(os, selected_rows.value()); +} + +void DeserializeFromStream( + std::istream& is, + SelectedRows* selected_rows, + const lite::Context& dev_ctx) { + { + // the 1st field, unit32_t version for SelectedRows + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + CHECK_EQ(version, 0U) << "Only version 0 is supported"; + } + { + // the 2st field, rows information + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + auto& rows = *selected_rows->mutable_rows(); + rows.resize(size); + for (uint64_t i = 0; i < size; ++i) { + is.read(reinterpret_cast(&rows[i]), sizeof(int64_t)); + } + } + { + // the 3st field, the height of the SelectedRows + int64_t height; + is.read(reinterpret_cast(&height), sizeof(int64_t)); + selected_rows->set_height(height); + } + // the 4st field, tensor which contains the data + TensorFromStream(is, selected_rows->mutable_value()); +} + +bool SelectedRows::HasKey(int64_t key) const { + return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? 
false + : true; +} + +int64_t SelectedRows::AutoGrownIndex(int64_t key, + bool auto_grown, + bool is_test) { + if (is_test) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + + rwlock_->RDLock(); + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + rwlock_->UNLock(); + if (!auto_grown) { + LOG(FATAL) << "key " << key << " not found"; + } + rwlock_->WRLock(); + auto map_size = id_to_index_.size(); + auto vector_size = rows_.size(); + if (map_size != vector_size) { + rwlock_->UNLock(); + LOG(FATAL) << "id_to_index_ size " << map_size + << " should have the same size with rows_ " << vector_size; + } + auto write_iter = id_to_index_.find(key); + if (write_iter == id_to_index_.end()) { + int row_num = rows_.size(); + if (row_num == value_->dims()[0]) { + rwlock_->UNLock(); + LOG(FATAL) << "selected rows is full, then length exceed " << row_num; + } + // key logic to put a key into id_to_index_ + rows_.push_back(key); + auto index = static_cast(rows_.size() - 1); + id_to_index_[key] = index; + rwlock_->UNLock(); + return index; + } else { + auto index = write_iter->second; + rwlock_->UNLock(); + return index; + } + } else { + auto index = iter->second; + rwlock_->UNLock(); + return index; + } +} + +void SelectedRows::SyncIndex() { + rwlock_->WRLock(); + id_to_index_.clear(); + for (size_t i = 0; i < rows_.size(); ++i) { + id_to_index_[rows_[i]] = i; + } + rwlock_->UNLock(); +} + +void SelectedRows::Get(const lite::Tensor& ids, + lite::Tensor* value, + bool auto_grown, + bool is_test) { + CHECK(value->IsInitialized()) << "The value tensor should be initialized."; + if (ids.numel() == 0) { + VLOG(3) << "keys is empty, please check data!"; + } else { + int64_t value_width = value_->numel() / value_->dims()[0]; + CHECK_EQ(value_width, value->numel() / value->dims()[0]) + << "output tensor should have the same shape with table " + "except the dims[0]."; + for (int i = 0; i < ids.numel(); ++i) { + auto id = ids.data()[i]; + int64_t index = AutoGrownIndex(id, auto_grown, is_test); + if (index < 0) { + VLOG(5) << "id " << id << " not in the table, return 0"; + TensorFillVisitor(value, i * value_width, value_width, 0.0) + .apply(); + } else { + TensorCopyVisitor(value, + i * value_width, + *value_.get(), + index * value_width, + value_width) + .apply(); + } + } + } +} + +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/selected_rows.h b/lite/backends/loongarch/fluid/selected_rows.h new file mode 100644 index 00000000000..ec679e8f772 --- /dev/null +++ b/lite/backends/loongarch/fluid/selected_rows.h @@ -0,0 +1,173 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "lite/backends/loongarch/fluid/rw_lock.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/model_parser/model_parser.h" +namespace paddle { +namespace lite { +namespace fluid { + +class SelectedRows { + /* + * @brief We can use the SelectedRows structure to reproduce a sparse table. + * A sparse table is a key-value structure that the key is an `int64_t`, + * and the value is a Tensor which the first dimension is 0. + * You can use the following interface to operate the sparse table, and you + * can find + * some detail information from the comments of each interface: + * + * HasKey(key), whether the sparse table has the specified key. + * Set(key, value), set a key-value pair into the sparse table. + * Get(keys, value*), get value by given key list and apply it to the given + * value pointer + * with the specified offset. + * + */ + public: + SelectedRows(const std::vector& rows, const int64_t& height) + : rows_(rows), height_(height) { + value_.reset(new Tensor()); + rwlock_.reset(new RWLock); + } + + SelectedRows() { + height_ = 0; + value_.reset(new Tensor()); + rwlock_.reset(new RWLock); + } + + TargetType target() const { return value_->target(); } + + const Tensor& value() const { return *value_; } + + Tensor* mutable_value() { return value_.get(); } + + int64_t height() const { return height_; } + + void set_height(int64_t height) { height_ = height; } + + const std::vector& rows() const { return rows_; } + + std::vector* mutable_rows() { return &rows_; } + + void set_rows(const std::vector& rows) { rows_ = rows; } + + /* + * @brief Get the index of key in rows + * + * @return -1 if the key does not exists. + */ + int64_t Index(int64_t key) const { + auto it = std::find(rows_.begin(), rows_.end(), key); + if (it == rows_.end()) { + LOG(FATAL) << "id " << key << " not in table"; + } + return static_cast(std::distance(rows_.begin(), it)); + } + + /* + * @brief whether has the specified key in the table. + * + * @return true if the key is exists. + */ + bool HasKey(int64_t key) const; + + /* + * @brief Get value by the key list. + * Note!!! this interface is only used when selected_rows is used as + * parameters + * for distribute lookup table. + * + * @return a list of pair which contains the non-exists key and the index in + * the value + */ + void Get(const lite::Tensor& ids, + lite::Tensor* value, + bool auto_grown = false, + bool is_test = false); + + /* + * @brief Get the index of the key from id_to_index_ map. If the key not + * exist, + * add the key into id_to_index_. + * + * Note!!! this interface is only used when selected_rows is used as + * parameters + * for distribute lookup table. + * + * @return index of the key. + */ + int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); + + /* + * @brief Get the index of the key from id_to_index_ map. + */ + inline int64_t GetIndexFromId(int64_t key) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + + void SyncIndex(); + /* + * @brief Get complete Dims before + */ + DDim GetCompleteDims() const { + DDim dims = value_->dims(); + dims[0] = height_; + return dims; + } + + private: + // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. + // SelectedRows are simply concated when adding together. Until a + // SelectedRows add a Tensor, will the duplicate rows be handled. 
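+  // Illustration (hypothetical values): with rows_ = {0, 4, 7} and
+  // height_ = 10, this object stands for a sparse table of 10 rows in which
+  // only rows 0, 4 and 7 carry data; value_ stores those dense rows in the
+  // same order as rows_, while GetCompleteDims() reports the full height.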
+ std::vector rows_; + std::map + id_to_index_; // should not be used when rows_ has duplicate member + std::unique_ptr value_{nullptr}; + int64_t height_; // height indicates the underline tensor's height + std::unique_ptr rwlock_{nullptr}; +}; + +/* + * Serialize/Desiralize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, + const SelectedRows& selected_rows, + const lite::Context& dev_ctx); +void DeserializeFromStream( + std::istream& is, + SelectedRows* selected_rows, + const lite::Context& dev_ctx); + +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/fluid/transform.h b/lite/backends/loongarch/fluid/transform.h new file mode 100644 index 00000000000..c133ee1042a --- /dev/null +++ b/lite/backends/loongarch/fluid/transform.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "lite/backends/loongarch/fluid/hostdevice.h" +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace fluid { + +// Transform applys a unary or a binary functor on each element in a +// range defined by a pair of iterators. +// +// - The specialization for CPU calls std::transform. +// +// NOTE: We need to define InputIter and OutputIter defined as +// different types, because the InputIter points op's inputs and +// OutputIter pints to op's outputs. +// +// NOTE: We don't assume that InputIter to be const InputType* and +// OutputIter to be OutputType*, because we might use a iterator +// class, paddle::fluid::operators::RowwiseTRansformIterator. +template +struct Transform { + // The unary version. + template + void operator()(const lite::Context& context, + InputIter first, + InputIter last, + OutputIter result, + UnaryOperation op); + + // The binary version. 
+  template <typename InputIter1,
+            typename InputIter2,
+            typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const lite::Context<Target>& context,
+                  InputIter1 first1,
+                  InputIter1 last1,
+                  InputIter2 first2,
+                  OutputIter result,
+                  BinaryOperation op);
+};
+
+template <>
+struct Transform<lite::TargetType::kLoongArch> {
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
+  void operator()(const lite::LoongArchContext& context,
+                  InputIter first,
+                  InputIter last,
+                  OutputIter result,
+                  UnaryOperation op) {
+    std::transform(first, last, result, op);
+  }
+
+  template <typename InputIter1,
+            typename InputIter2,
+            typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const lite::LoongArchContext& context,
+                  InputIter1 first1,
+                  InputIter1 last1,
+                  InputIter2 first2,
+                  OutputIter result,
+                  BinaryOperation op) {
+    std::transform(first1, last1, first2, result, op);
+  }
+};
+
+} // namespace fluid
+} // namespace lite
+} // namespace paddle
diff --git a/lite/backends/loongarch/jit/CMakeLists.txt b/lite/backends/loongarch/jit/CMakeLists.txt
new file mode 100644
index 00000000000..1d757f08a3d
--- /dev/null
+++ b/lite/backends/loongarch/jit/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(jit_file ${PADDLE_BINARY_DIR}/lite/backends/loongarch/jit/kernels.h)
+file(WRITE ${jit_file} "// Generated by the lite/backends/loongarch/jit/CMakeLists.txt. DO NOT EDIT!\n\n")
+file(APPEND ${jit_file} "\#pragma once\n")
+file(APPEND ${jit_file} "\#include \"lite/backends/loongarch/jit/helper.h\"\n")
+file(APPEND ${jit_file} "\#include \"lite/backends/loongarch/jit/registry.h\"\n\n")
+
+set(LOONGARCH_MATH_DEPS ${LOONGARCH_MATH_DEPS} cblas gflags xxhash CACHE INTERNAL "")
+
+# refer must go first
+add_subdirectory(refer)
+add_subdirectory(more)
diff --git a/lite/backends/loongarch/jit/README.en.md b/lite/backends/loongarch/jit/README.en.md
new file mode 100644
index 00000000000..dc9eb4cf239
--- /dev/null
+++ b/lite/backends/loongarch/jit/README.en.md
@@ -0,0 +1,103 @@
+# JIT Kernel
+
+JIT (Just In Time) Kernel contains actually generated code and some other implementations with the same logic.
+Each implementation has its own condition for use, defined in `CanBeUsed`.
+They are combined together to get the best performance for one single independent function.
+They could be some very simple functions like vector multiply, or some complicated functions like LSTM.
+And they can be composed with other existing jit kernels to build up a complex function.
+Currently it is only supported on CPU.
+
+## Contents
+
+```txt
+PaddlePaddle/Paddle/paddle/fluid/
+├── ...
+└── lite/
+    ├── .../
+    └── jit/
+        ├── ...
+        ├── gen/
+        │   └── ...
+        ├── more/
+        │   ├── ...
+        │   ├── mkl/
+        │   │   └── ...
+        │   ├── mkldnn/
+        │   │   └── ...
+        │   ├── mix/
+        │   │   └── ...
+        │   ├── intrinsic/
+        │   │   └── ...
+        │   └── openblas/
+        │       └── ...
+        └── refer/
+            └── ...
+```
+
+All basic definitions of jit kernels live in `lite/backends/loongarch/jit`, including the three key folders `refer`, `gen`, `more`. There is only one unique name for each kernel, while it may have several implementations with the same functionality.
+
+- `refer`: Each kernel must have one reference implementation on CPU; it should only focus on correctness and should not depend on any third-party libraries.
+- `gen`: The generated code should be kept here. It should be designed focusing on the best performance, which depends on Xbyak.
+- `more`: All other implementations should be kept in this folder, with one directory corresponding to one library kind or method kind, such as mkl, mkldnn, openblas or intrinsic code. Each implementation should have its own advantage.
+
+## How to use
+
+We present these methods to get the functions:
+- `GetAllCandidateFuncs`. It returns all the implementations supported. All of the implementations produce the same result; you can do some runtime benchmark to choose which one should actually be used.
+- `GetDefaultBestFunc`. It only returns one default function pointer, which is tuned offline with some general configurations and attributes. This should cover most situations.
+- `KernelFuncs::Cache()`. It gets the default function and caches it for the next call with the same attribute.
+- `GetReferFunc`. It gets the reference code on CPU; all the other implementations share the same logic as this reference code.
+
+And here are some examples:
+
+Get from cache:
+
+```cpp
+    using T = float;
+    jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
+    auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
+    seqpool_func(src_data, dst_data, &attr);
+```
+
+Get all implementations and run once:
+
+```cpp
+    using T = float;
+    jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
+    auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
+    for (auto f : funcs) {
+      LOG(INFO) << "Kernel implementation type: " << f.first;
+      f.second(src_data, dst_data, &attr);
+    }
+```
+
+All kernels are included in `lite/backends/loongarch/jit/kernels.h`, which is automatically generated at compile time; you only need to include this one header to get all the registered kernels.
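+
+Check a result against the reference implementation (a minimal sketch following the examples above; `width`, `src_data` and `ref_data` are assumed to be buffers prepared by the caller):
+
+```cpp
+    using T = float;
+    jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
+    // The refer kernel is always registered and shares the semantics of every
+    // other implementation, so it can serve as a correctness baseline.
+    auto ref_func = jit::GetReferFunc<jit::SeqPoolTuple<T>>();
+    ref_func(src_data, ref_data, &attr);
+```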
+
+## Solid Test
+
+- Unit Test
+    All functions should be compared with the corresponding reference functions, covering both the `float` and `double` data types.
+- Benchmark
+    All functions should be tested, making sure that `jit::GetDefaultBestFunc` obtains the best performance with all attributes.
+
+# How to add a new kernel
+
+## Required
+
+1. Add `your_key` to `KernelType`.
+2. Add your new `KernelTuple`, which must include `your_key`. It should be a combination of the data type, attribute type and function type. You can refer to `SeqPoolTuple`.
+3. Add the reference function of `your_key`.
+Note:
+    - It should run on CPU and must not depend on any third party.
+    - Add `USE_JITKERNEL_REFER_LITE(your_key)` in `refer/CMakeLists.txt` to make sure this code can be used.
+4. Add a unit test in `test.cc`, and verify at least `float` and `double`.
+Test more data types for some special functions if necessary, for example `int8`.
+5. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `GetDefaultBestFunc` always gets the best one.
+
+## Optional
+
+Add more implementations of `your_key` for performance enhancement.
+
+1. Add functions based on generated code in `gen`. They should be derived from `JitCode` and have a corresponding creator derived from `JitCodeCreator`, which will be registered on `your_key`.
+2. If a new attribute type is added, you should specialize `JitCodeKey` for this type.
+3. Add more functions in `more`; you can use any third party you wish, like mkl, mkldnn or intrinsic code, to reach the best performance.
diff --git a/lite/backends/loongarch/jit/README.md b/lite/backends/loongarch/jit/README.md
new file mode 100644
index 00000000000..bc0e27234d0
--- /dev/null
+++ b/lite/backends/loongarch/jit/README.md
@@ -0,0 +1,94 @@
+# JIT Kernel
+
+结合函数模板和JIT生成需要的kernel函数。
+这里的kernel是比Operator中kernel更小级别的算子单元,更侧重的是在不同硬件上的性能。可以有多重第三方库的实现,每种实现有自己的`CanBeUsed`函数负责什么条件下可以被调用。
+这里实现的函数可以非常细粒度的函数方法,比如Vector MUL, 也可以是一个复杂的逻辑比如LSTM等。复杂的逻辑也可以由自己的底层函数拼接而成。
+目前仅支持CPU上的高性能计算。
+
+## 目录结构
+
+```txt
+PaddlePaddle/Paddle/paddle/fluid/
+├── ...
+└── lite/
+    ├── .../
+    └── jit/
+        ├── ...
+        ├── gen/
+        │   └── ...
+        ├── more/
+        │   ├── ...
+        │   ├── mkl/
+        │   │   └── ...
+ │ ├── mkldnn/ + │ │ └── ... + │ ├── mix/ + │ │ └── ... + │ ├── intrinsic/ + │ │ └── ... + │ └── openblas/ + │ └── ... + └── refer/ + └── ... +``` + +基本类的定义都放在根目录下,根目录下包括gen,more和refer三个目录。每个目录下都是一种或者多种实现,每种kernel算子都需要有reference的实现,用作单元测试的基准,其他的实现都是可选的。 +- gen: 代表使用jit生成的code,需要依赖xbyak库。该实现最关心的就是性能。 +- refer: 代表reference的实现,每种kernel算子都需要有在CPU上的reference的实现,他主要关心的算法逻辑的正确性。 +- more: 下面可以放入跟多实现,可以包括mkl,mkldnn,intrinsic,openblas等,也可以是自身已有的kernel组合。 + +## 动态获取 + +- 提供`GetAllCandidateFuncs`方法,根据输入的kernel类别,获取满足要求的所有函数实现。所有实现保证结果一致,但是速度不一致,可以根据具体输入属性大小,动态测试得到当前最优实现,手动选择最优函数。 +- 提供`GetDefaultBestFunc`方法,返回一个默认最优的函数实现。该函数是根据一些通用配置离线tuning之后的结果,能覆盖大多数情况下最优结果。 +- 提供`KernelFuncs::Cache()`方法,该方法会返回默认最优的函数,同时会缓存该函数指针,如果出现属性一致的情况,直接返回上次的函数指针,如果不存在则根据属性新建。 +- 提供`GetReferFunc` 方法,返回该kernel最原始的逻辑函数。该方法与kernel的输入大小和属性没有任何关系,有且并只有一个在CPU上的实现。该方法表征了kernel的原始逻辑,其他所有实现的逻辑与它保持一致。 + +### 例子 + +所有kernel的调用只需要在头文件中包含`"lite/backends/x86/jit/kernels.h"`, 该文件是编译时自动生成的。 + +直接从缓存中获取默认最优的函数。 + +```cpp + using T = float; + jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); + auto seqpool_func = jit::KernelFuncs, platform::CPUPlace>::Cache().At(attr); + seqpool_func(src_data, dst_data, &attr); +``` + +跑一遍所有实现,并输出实现类别。 + +```cpp + using T = float; + jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); + auto funcs = jit::GetAllCandidateFuncsWithTypes, platform::CPUPlace>(attr); + for (auto f : funcs) { + LOG(INFO) << "Kernel implementation type: " << f.first; + f.second(src_data, dst_data, &attr); + } +``` + +## 测试 + +- 逻辑测试 + 所有实现都要与refer的code对比,需要满足精度要求, 包括float和double的数据类型 +- 性能测试 + 所有实现的性能对比,并且与最终的`jit::GetDefaultBestFunc`方法对比,该方法拿到的性能需要在各种条件下都是最好的。 + +# 如何添加新的算子 + +1. 在`KernelType` 中添加 `your_key` 。 +2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER_LITE(your_key)`来使用该kernel。 +3. (optional) 实现更多的算法在`more`目录下,可以依赖mkl,intrinsic或者mkldnn等第三方库。 +4. (optional) 实现基于Xbyak的生成code,在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`,并注册在与refer相同的`KernelType`上。 +5. 添加新的`KernelTuple`,需要与`KernelType`一一对应,是所有类型的一个打包,包括数据类型,属性的类型,以及返回的函数类型。可以参考`SeqPoolTuple`,新加的Attr类型需要特例化`JitCodeKey`方法。 +6. 在`test.cc`中添加unit test,至少需要测试`float`和`double`两种数据类型,如有必要需要支持额外的数据类型,比如`int8`的相关函数。 +7. 在`benchmark.cc`中添加相应的性能对比,同一种kernel需要对比所有实现,并且确保`GetDefaultBestFunc`得到的实现一直是速度最快的。 + +# 优点 +- 接口方便,灵活调用。 +- 同一套逻辑可以有多套实现,可以依赖多套第三方库,互不影响。 +- 目录结构清晰,不会在某个文件中有多个宏定义,导致的可读性差问题。 +- 优化方便,可以直接针对某种属性针对性优化,并不影响其他属性下的性能。 +- 可以支持多种平台,包括Linux,Mac 和 Windows,至少可以保证每种平台都可以正常work。后期也可以针对不同平台有针对的优化。框架层面可以使用统一接口,不必关心底层实现。 diff --git a/lite/backends/loongarch/jit/gen_base.cc b/lite/backends/loongarch/jit/gen_base.cc new file mode 100644 index 00000000000..8c43cc024f0 --- /dev/null +++ b/lite/backends/loongarch/jit/gen_base.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "lite/backends/loongarch/jit/gen_base.h" +#include +#include +#include +#include +// #include "paddle/fluid/memory/allocation/cpu_allocator.h" // for +// posix_memalign +#include "lite/backends/loongarch/cpu_info.h" +#include "lite/backends/loongarch/jit/macro.h" +#include "lite/utils/env.h" +#include "lite/utils/log/cp_logging.h" + +#ifndef _WIN32 +#define posix_memalign_free free +#endif + +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#endif + +// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); +bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); + +namespace paddle { +namespace lite { +namespace jit { + +// refer do not need CanBeUsed, it would be the last one. +void GenBase::dumpCode(const unsigned char* code) const { + if (code) { + static int counter = 0; + std::ostringstream filename; + filename << "paddle_jitcode_" << name() << "." << counter << ".bin"; + counter++; + std::ofstream fout(filename.str(), std::ios::out); + if (fout.is_open()) { + fout.write(reinterpret_cast(code), this->getSize()); + fout.close(); + } + } +} + +void* GenBase::operator new(size_t size) { + void* ptr; + constexpr size_t alignment = 32ul; + CHECK_EQ(posix_memalign(&ptr, alignment, size), 0) << "GenBase Alloc " << size + << " error!"; + CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size; + return ptr; +} + +void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); } + +std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { + int block; + int max_num_regs; + if (loongarch::MayIUse(loongarch::lasx)) { + block = LASX_FLOAT_BLOCK; + max_num_regs = 32; + } else { + block = LSX_FLOAT_BLOCK; + max_num_regs = 16; + } + // one for x, one for y, others for z + const int max_used_regs_for_n = max_num_regs - 2; + const int aligned_n = n % block == 0 ? n : (n / block + 1) * block; + const int num_block = aligned_n / block; + const int num_groups = num_block / max_used_regs_for_n; + std::vector groups(num_groups, max_used_regs_for_n); + int rest_num_regs = num_block % max_used_regs_for_n; + if (rest_num_regs != 0) { + groups.push_back(rest_num_regs); + } + if (block_out) { + *block_out = block; + } + if (rest_out) { + *rest_out = n % block; + } + return groups; +} + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/gen_base.h b/lite/backends/loongarch/jit/gen_base.h new file mode 100644 index 00000000000..4601c3962d5 --- /dev/null +++ b/lite/backends/loongarch/jit/gen_base.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once + +#include +#include // for unique_ptr +#include +#include +#include "lite/backends/loongarch/jit/kernel_base.h" + +// DECLARE_bool(dump_jitcode); +extern bool dump_jitcode; + +namespace paddle { +namespace lite { +namespace jit { + +class GenBase : public Kernel { + public: + virtual ~GenBase() = default; + virtual std::string name() const = 0; + virtual size_t getSize() const = 0; + virtual const unsigned char* getCodeInternal() const = 0; + const char* ImplType() const override { return "JitCode"; } + template + Func getCode() const { + const unsigned char* code = this->getCodeInternal(); + if (dump_jitcode) { + this->dumpCode(code); + } + // Note: failed to cast with reinterpret_cast on Mac clang, + // then workaround with const_cast. Any better idea is appreciated. + return reinterpret_cast(const_cast(code)); + } + + void* operator new(size_t size); + void operator delete(void* ptr); + void* operator new[](size_t size) { return operator new(size); } + void operator delete[](void* ptr) { operator delete(ptr); } + + protected: + void dumpCode(const unsigned char* code) const; +}; + +// Creator is used to creat the jitcode and save in pool. +// Every JitCode should have one creator. +class GenCreator { + public: + virtual ~GenCreator() = default; +}; + +template +class JitCodeCreator : public GenCreator { + public: + virtual ~JitCodeCreator() = default; + + // condition when this jit code can be used. + virtual bool CanBeUsed(const Attr& attr) const = 0; + + // estimate this code size + virtual size_t CodeSize(const Attr& attr) const = 0; + + // create this code + virtual std::unique_ptr CreateJitCode(const Attr& attr) const = 0; +}; + +// unify the method of packed groups +// output the packed groups which used in weights, the block size and rest size +std::vector packed_groups(int n, + int k, + int* block = nullptr, + int* rest = nullptr); + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/helper.cc b/lite/backends/loongarch/jit/helper.cc new file mode 100644 index 00000000000..508472f4744 --- /dev/null +++ b/lite/backends/loongarch/jit/helper.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "lite/backends/loongarch/jit/helper.h" +#include // tolower +#include +#include +#include +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace jit { + +#define ONE_CASE(key) \ + case key: \ + return #key + +const char* to_string(KernelType kt) { + switch (kt) { + ONE_CASE(kNone); + ONE_CASE(kVMul); + ONE_CASE(kVAdd); + ONE_CASE(kVAddRelu); + ONE_CASE(kVSub); + ONE_CASE(kVScal); + ONE_CASE(kStrideScal); + ONE_CASE(kVAddBias); + ONE_CASE(kVRelu); + ONE_CASE(kVBroadcast); + ONE_CASE(kVCopy); + ONE_CASE(kVIdentity); + ONE_CASE(kVExp); + ONE_CASE(kVSquare); + ONE_CASE(kVSigmoid); + ONE_CASE(kVTanh); + ONE_CASE(kLSTMCtHt); + ONE_CASE(kLSTMC1H1); + ONE_CASE(kGRUH1); + ONE_CASE(kGRUHtPart1); + ONE_CASE(kGRUHtPart2); + ONE_CASE(kCRFDecoding); + ONE_CASE(kLayerNorm); + ONE_CASE(kNCHW16CMulNC); + ONE_CASE(kSeqPool); + ONE_CASE(kMatMul); + ONE_CASE(kHMax); + ONE_CASE(kHSum); + ONE_CASE(kStrideASum); + ONE_CASE(kSoftmax); + ONE_CASE(kEmbSeqPool); + ONE_CASE(kSgd); + default: + LOG(FATAL) << "Not support type: %d, or forget to add it."; + return "NOT JITKernel"; + } + return nullptr; +} + +const char* to_string(SeqPoolType tp) { + switch (tp) { + ONE_CASE(kNonePoolType); + ONE_CASE(kSum); + ONE_CASE(kAvg); + ONE_CASE(kSqrt); + default: + LOG(FATAL) << "Not support type: %d, or forget to add it."; + return "NOT PoolType"; + } + return nullptr; +} +#undef ONE_CASE + +KernelType to_kerneltype(const std::string& act) { + std::string lower = act; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + if (lower == "relu" || lower == "vrelu") { + return kVRelu; + } else if (lower == "identity" || lower == "videntity" || lower == "") { + return kVIdentity; + } else if (lower == "exp" || lower == "vexp") { + return kVExp; + } else if (lower == "sigmoid" || lower == "vsigmoid") { + return kVSigmoid; + } else if (lower == "tanh" || lower == "vtanh") { + return kVTanh; + } + LOG(FATAL) << "Not support type: %s, or forget to add this case"; + return kNone; +} + +template <> +void pack_weights(const float* src, float* dst, int n, int k) { + int block, rest; + const auto groups = packed_groups(n, k, &block, &rest); + std::for_each(groups.begin(), groups.end(), [&](int i) { + CHECK_GT(i, 0) << "each element of groups should be larger than 0."; + }); + int sum = std::accumulate(groups.begin(), groups.end(), 0); + std::memset(dst, 0, k * sum * block * sizeof(float)); + CHECK_GE(sum * block, n) + << "The packed n should be equal to or larger than n"; + + const int block_len = sizeof(float) * block; + int n_offset = 0; + + for (size_t g = 0; g < groups.size(); ++g) { + const float* from = src + n_offset; + for (int j = 0; j < k; ++j) { + size_t copy_sz = groups[g] * block_len; + if (g == groups.size() - 1 && rest != 0) { + copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float); + } + std::memcpy(dst, from + j * n, copy_sz); + dst += groups[g] * block; + } + n_offset += groups[g] * block; + } +} + +template +typename std::enable_if::value>::type pack_weights( + const T* src, T* dst, int n, int k) { + LOG(FATAL) << "Only support pack with float type."; +} + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/helper.h b/lite/backends/loongarch/jit/helper.h new file mode 100644 index 00000000000..1741bae2269 --- /dev/null +++ b/lite/backends/loongarch/jit/helper.h @@ -0,0 +1,268 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // for std::move +#include +#include "lite/backends/loongarch/jit/gen_base.h" +#include "lite/backends/loongarch/jit/kernel_base.h" +#include "lite/backends/loongarch/jit/kernel_key.h" +#include "lite/backends/loongarch/jit/kernel_pool.h" +#include "lite/utils/log/cp_logging.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace jit { + +template +inline typename std::enable_if< + std::is_same::value, + const Kernel*>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { + using Attr = typename KernelTuple::attr_type; + int64_t key = JitCodeKey(attr); + auto& codes = JitCodePool::Instance(); + if (codes.Has(key)) { + return codes.AllKernels().at(key).get(); + } + + // creator is not related with attr, so can use KernelKey as key + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); + // pool: (KernelKey(type, place), vector) + auto& creator_map = JitCodeCreatorPool::Instance().AllCreators(); + auto iter = creator_map.find(kkey); + if (iter != creator_map.end()) { + auto& creators = iter->second; + for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->CanBeUsed(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto res = p.get(); + codes.Insert(key, std::move(p)); + return res; + } + } + } + } + return nullptr; +} + +template +inline typename std::enable_if< + !std::is_same::value, + const Kernel*>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { + return nullptr; +} + +// Refer code do not related with attr, which is just for cast +// Refer is always on CPUPlace +template +inline const Kernel* GetReferKernel() { + auto& ref_pool = ReferKernelPool::Instance().AllKernels(); + KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace()); + auto ref_iter = ref_pool.find(kkey); + CHECK(ref_iter != ref_pool.end()) + << "Every Kernel should have reference function."; + auto& ref_impls = ref_iter->second; + for (auto& impl : ref_impls) { + auto i = dynamic_cast*>(impl.get()); + if (i) { + return i; + } + } + return nullptr; +} + +template +inline typename KernelTuple::func_type GetReferFunc() { + auto ker = GetReferKernel(); + auto p = dynamic_cast*>(ker); + CHECK(p) << "The Refer kernel should exsit"; + return p->GetFunc(); +} + +// Return all Kernels that can be used +template +std::vector GetAllCandidateKernels( + const typename KernelTuple::attr_type& attr) { + // the search order shoudl be jitcode > more > refer + std::vector res; + auto jitker = GetJitCode(attr); + if (jitker) { + res.emplace_back(jitker); + } + + // more kernelpool: (KernelKey(type, place), vector) + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); + auto& pool = KernelPool::Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); + if (i && i->CanBeUsed(attr)) { + 
res.emplace_back(i); + } + } + } + + // The last implementation should be reference function on CPUPlace. + auto ref = GetReferKernel(); + CHECK(ref != nullptr) << "Refer Kernel can not be empty."; + res.emplace_back(ref); + return res; +} + +template +std::vector> +GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { + using Func = typename KernelTuple::func_type; + auto kers = GetAllCandidateKernels(attr); + std::vector> res; + for (auto k : kers) { + std::string name = k->ImplType(); + if (name == "JitCode") { + auto i = dynamic_cast(k); + CHECK(i) << "jitcode kernel cast can not fail."; + res.emplace_back(std::make_pair(name, i->template getCode())); + } else { + auto i = dynamic_cast*>(k); + CHECK(i) << "kernel cast can not fail."; + res.emplace_back(std::make_pair(name, i->GetFunc())); + } + } + return res; +} + +template +std::vector GetAllCandidateFuncs( + const typename KernelTuple::attr_type& attr) { + auto funcs = GetAllCandidateFuncsWithTypes(attr); + std::vector res; + for (auto& i : funcs) { + res.emplace_back(i.second); + } + return res; +} + +template +typename KernelTuple::func_type GetDefaultBestFunc( + const typename KernelTuple::attr_type& attr) { + auto funcs = GetAllCandidateFuncs(attr); + CHECK_GE(funcs.size(), 1UL); + // Here could do some runtime benchmark of this attr and return the best one. + // But yet just get the first one as the default best one, + // which is searched in order and tuned by offline. + return funcs[0]; +} + +template +class KernelFuncs { + public: + KernelFuncs() = default; + static KernelFuncs& Cache() { + static LITE_THREAD_LOCAL KernelFuncs g_func_cache; + return g_func_cache; + } + + // the exposed interface to use + typename KernelTuple::func_type At( + const typename KernelTuple::attr_type& attr) { + // Maybe here is not good enough, not all kernels should have jitcode + int64_t key = JitCodeKey(attr); + if (Has(key)) { + return funcs_.at(key); + } + // If do not have this attr in cache then get the default best + auto func = GetDefaultBestFunc(attr); + Insert(key, func); + return func; + } + + typename KernelTuple::func_type operator[]( + const typename KernelTuple::attr_type& attr) { + return At(attr); + } + + protected: + bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } + void Insert(int64_t key, typename KernelTuple::func_type func) { + funcs_.emplace(key, func); + } + + private: + std::map funcs_; +}; + +const char* to_string(KernelType kt); +const char* to_string(SeqPoolType kt); + +KernelType to_kerneltype(const std::string& act); + +inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) { + os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) + << "],act_cand[" << to_string(attr.act_cand) << "],act_cell[" + << to_string(attr.act_cell) << "],use_peephole[" + << (attr.use_peephole ? 
"True" : "False") << "]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { + os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) + << "],act_cand[" << to_string(attr.act_cand) << "]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { + os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" + << to_string(attr.type) << "]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, + const emb_seq_pool_attr_t& attr) { + os << "table_height[" << attr.table_height << "],table_width[" + << attr.table_width << "],index_height[" << attr.index_height + << "],index_width[" << attr.index_width << "],output_width[" + << attr.out_width << "],pool_type[" << to_string(attr.pool_type) << "]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const sgd_attr_t& attr) { + os << "param_height[" << attr.param_height << "],param_width[" + << attr.param_width << "],grad_height[" << attr.grad_height + << "],grad_width[" << attr.grad_width << "],selected_rows_size[" + << attr.selected_rows_size << "]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) { + os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]"; + return os; +} + +// expose the method to pack matmul weight +template +void pack_weights(const T* src, T* dst, int n, int k); + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/kernel_base.h b/lite/backends/loongarch/jit/kernel_base.h new file mode 100644 index 00000000000..3aba9fb4269 --- /dev/null +++ b/lite/backends/loongarch/jit/kernel_base.h @@ -0,0 +1,365 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once +#include +#include "lite/backends/loongarch/jit/macro.h" + +namespace paddle { +namespace lite { +namespace jit { + +typedef enum { + kNone = 0, + // sort by alphabet + kCRFDecoding = 1, + kEmbSeqPool = 2, + kGRUH1, + kGRUHtPart1, + kGRUHtPart2, + kHSum, // horizontal max + kHMax, // horizontal sum + kLSTMCtHt, + kLSTMC1H1, + kLayerNorm, + kMatMul, + kNCHW16CMulNC, + kSeqPool, + kSoftmax, + kStrideASum, + kStrideScal, + kVAdd, + kVAddBias, + kVAddRelu, + kVBroadcast, + kVCopy, + kVExp, + kVIdentity, + kVMul, + kVRelu, + kVScal, + kSgd, + kVSigmoid, + kVSquare, + kVSub, + kVTanh, +} KernelType; + +typedef enum { + kNonePoolType = 0, + kSum = 1, + kAvg, + kSqrt, +} SeqPoolType; + +// x, y, z, n +template +struct XYZNTuple { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, const T*, T*, int); +}; + +// a, x, y, n +template +struct AXYNTuple : public XYZNTuple {}; + +// a, x, y, n, stride +template +struct AXYNSTuple { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, const T*, T*, int, int); +}; + +// x, y, n +template +struct XYNTuple { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int); +}; + +// x, returned value, n +template +struct XRNTuple : public XYNTuple {}; + +// x, returned value, n, stride +template +struct XRNSTuple { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int, int); +}; + +#define DECLARE_KERNELTUPLE(kernel_tuple, type) \ + template \ + struct type##Tuple : public kernel_tuple { \ + static constexpr KernelType kernel_type = k##type; \ + } + +// Tuple should be corresponding to the KernelType +DECLARE_KERNELTUPLE(XYZNTuple, VMul); +DECLARE_KERNELTUPLE(XYZNTuple, VAdd); +DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu); +DECLARE_KERNELTUPLE(XYZNTuple, VSub); + +DECLARE_KERNELTUPLE(AXYNTuple, VScal); +DECLARE_KERNELTUPLE(AXYNTuple, VAddBias); + +DECLARE_KERNELTUPLE(AXYNSTuple, StrideScal); + +DECLARE_KERNELTUPLE(XYNTuple, VRelu); +DECLARE_KERNELTUPLE(XYNTuple, VIdentity); +DECLARE_KERNELTUPLE(XYNTuple, VSquare); +DECLARE_KERNELTUPLE(XYNTuple, VExp); +DECLARE_KERNELTUPLE(XYNTuple, VSigmoid); +DECLARE_KERNELTUPLE(XYNTuple, VTanh); +DECLARE_KERNELTUPLE(XYNTuple, VCopy); + +DECLARE_KERNELTUPLE(XRNTuple, HMax); +DECLARE_KERNELTUPLE(XRNTuple, HSum); + +DECLARE_KERNELTUPLE(XRNSTuple, StrideASum); + +typedef struct { + void* gates; // gates: x_ch, x_ih, x_fh, x_oh + const void* ct_1; + void* ct; + void* ht; + /* weight_peephole and checked data are only used in peephole*/ + const void* wp{nullptr}; // W_ic, W_fc, W_oc + void* checked{nullptr}; // size: 2 * d +} lstm_t; + +typedef struct { + void* gates; // gates: {x_update, x_reset; x_state} + const void* ht_1; + void* ht; +} gru_t; + +struct rnn_attr_s { + int d; + KernelType act_gate, act_cand; + rnn_attr_s() = default; + explicit rnn_attr_s(int _d, KernelType _act_gate, KernelType _act_cand) + : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} +}; + +struct lstm_attr_s : public rnn_attr_s { + bool use_peephole; + KernelType act_cell; + lstm_attr_s() = default; + explicit lstm_attr_s(int _d, + KernelType _act_gate, + KernelType _act_cand, + KernelType _act_cell, + bool _use_peephole = false) + : rnn_attr_s(_d, _act_gate, _act_cand), + use_peephole(_use_peephole), + act_cell(_act_cell) {} +}; + +typedef struct rnn_attr_s gru_attr_t; +typedef struct lstm_attr_s lstm_attr_t; + +template +struct LSTMTuple { + typedef T data_type; + typedef lstm_attr_t attr_type; 
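+  // A minimal call sketch (illustrative buffer names, not part of the API):
+  //   lstm_attr_t attr(d, kVSigmoid, kVTanh, kVTanh);
+  //   lstm_t step;
+  //   step.gates = gates;            // 4 * d gate pre-activations
+  //   step.ct_1 = ct_prev;
+  //   step.ct = ct;
+  //   step.ht = ht;
+  //   func(&step, &attr);            // func has the func_type declared below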
+ typedef void (*func_type)(lstm_t*, const lstm_attr_t*); +}; + +template +struct GRUTuple { + typedef T data_type; + typedef gru_attr_t attr_type; + typedef void (*func_type)(gru_t*, const gru_attr_t*); +}; + +DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt); +DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1); + +DECLARE_KERNELTUPLE(GRUTuple, GRUH1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2); + +#undef DECLARE_KERNELTUPLE + +template +struct VBroadcastTuple { + static constexpr KernelType kernel_type = kVBroadcast; + typedef T data_type; + typedef int64_t attr_type; + typedef void (*func_type)(const T*, T*, int64_t, int64_t); +}; + +typedef struct seq_pool_attr_s { + int h, w; // h should always be the first one + SeqPoolType type; + seq_pool_attr_s() = default; + explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1) + : h(height), w(width), type(pool_type) {} +} seq_pool_attr_t; + +template +struct SeqPoolTuple { + static constexpr KernelType kernel_type = kSeqPool; + typedef T data_type; + typedef seq_pool_attr_t attr_type; + typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); +}; + +typedef struct emb_seq_pool_attr_s { + int64_t table_height, table_width; + int64_t index_height, index_width; + int64_t out_width; + SeqPoolType pool_type; + emb_seq_pool_attr_s() = default; + explicit emb_seq_pool_attr_s(int64_t tbl_height, + int64_t tbl_width, + int64_t idx_height, + int64_t idx_width, + int64_t output_width, + SeqPoolType seqpool_type = SeqPoolType::kSum) + : table_height(tbl_height), + table_width(tbl_width), + index_height(idx_height), + index_width(idx_width), + out_width(output_width), + pool_type(seqpool_type) {} +} emb_seq_pool_attr_t; + +template +struct EmbSeqPoolTuple { + static constexpr KernelType kernel_type = kEmbSeqPool; + typedef T data_type; + typedef emb_seq_pool_attr_t attr_type; + typedef void (*func_type)(const T*, + const int64_t*, + T*, + const emb_seq_pool_attr_t*); +}; + +typedef struct sgd_attr_s { + int64_t param_height, param_width; + int64_t grad_height, grad_width; + int64_t selected_rows_size; + sgd_attr_s() = default; + explicit sgd_attr_s(int64_t param_h, + int64_t param_w, + int64_t grad_h, + int64_t grad_w, + int64_t selected_rows_sz) + : param_height(param_h), + param_width(param_w), + grad_height(grad_h), + grad_width(grad_w), + selected_rows_size(selected_rows_sz) {} +} sgd_attr_t; + +template +struct SgdTuple { + static constexpr KernelType kernel_type = kSgd; + typedef T data_type; + typedef sgd_attr_t attr_type; + typedef void (*func_type)( + const T*, const T*, const T*, const int64_t*, T*, const sgd_attr_t*); +}; + +typedef struct matmul_attr_s { + int m, n, k; + void* packed_weight{nullptr}; + matmul_attr_s() = default; + explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr) + : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {} +} matmul_attr_t; + +template +struct MatMulTuple { + static constexpr KernelType kernel_type = kMatMul; + typedef T data_type; + typedef matmul_attr_t attr_type; + typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); +}; + +template +struct CRFDecodingTuple { + static constexpr KernelType kernel_type = kCRFDecoding; + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); +}; + +template +struct LayerNormTuple { + static constexpr KernelType kernel_type = kLayerNorm; + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)( + 
T*, T*, T*, T*, const T*, const T*, int, const float, int); +}; + +template +struct SoftmaxTuple { + static constexpr KernelType kernel_type = kSoftmax; + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int, int, int); +}; + +// nChw16c = nChw16c .* NC +template +struct NCHW16CMulNCTuple { + static constexpr KernelType kernel_type = kNCHW16CMulNC; + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, const T*, T*, int, int); +}; + +// Just for adding to kernel pool without template +class Kernel { + public: + Kernel() = default; + virtual ~Kernel() = default; + virtual const char* ImplType() const = 0; +}; + +template +class KernelMore : public Kernel { + public: + using T = typename KernelTuple::data_type; + using Func = typename KernelTuple::func_type; + using Attr = typename KernelTuple::attr_type; + virtual Func GetFunc() const { return func; } + // specify this kernel can be used, means it should not fail if use it. + virtual bool CanBeUsed(const Attr& attr) const = 0; + + protected: + Func func{nullptr}; +}; + +template +class ReferKernel : public KernelMore { + public: + // Refer code can always be used + bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override { + return true; + } + const char* ImplType() const override { return "Refer"; } +}; + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/kernel_key.cc b/lite/backends/loongarch/jit/kernel_key.cc new file mode 100644 index 00000000000..59445f59107 --- /dev/null +++ b/lite/backends/loongarch/jit/kernel_key.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "lite/backends/loongarch/jit/kernel_key.h" +#include // XXH64: 13.8 GB/s +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace jit { + +template <> +int64_t JitCodeKey(const int& d) { + return d; +} + +template <> +int64_t JitCodeKey(const int64_t& d) { + return d; +} + +template <> +int64_t JitCodeKey(const gru_attr_t& attr) { + return XXH64(&attr, sizeof(gru_attr_t), 0); +} + +template <> +int64_t JitCodeKey(const lstm_attr_t& attr) { + int keys[5] = {attr.d, + static_cast(attr.act_gate), + static_cast(attr.act_cand), + static_cast(attr.act_cell), + static_cast(attr.use_peephole)}; + return XXH64(keys, sizeof(int) * 5, 0); +} + +template <> +int64_t JitCodeKey(const seq_pool_attr_t& attr) { + int keys[2] = {attr.w, static_cast(attr.type)}; + return XXH64(keys, sizeof(int) * 2, 0); +} + +template <> +int64_t JitCodeKey(const matmul_attr_t& attr) { + return XXH64(&attr, sizeof(int) * 3, 0); // m, n, k +} + +template <> +int64_t JitCodeKey(const emb_seq_pool_attr_t& attr) { + return attr.table_width; +} + +template <> +int64_t JitCodeKey(const sgd_attr_t& attr) { + return attr.grad_width; +} + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/kernel_key.h b/lite/backends/loongarch/jit/kernel_key.h new file mode 100644 index 00000000000..4f0c6fd9549 --- /dev/null +++ b/lite/backends/loongarch/jit/kernel_key.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include +#include +#include "lite/backends/loongarch/jit/kernel_base.h" +#include "lite/backends/loongarch/legacy_place.h" + +namespace paddle { +namespace lite { +namespace jit { + +struct KernelKey { + struct Hash { + size_t operator()(const KernelKey& key) const { + int place = key.place_.which(); // less than 2^8 + int type = static_cast(key.type_) << 8; // less than 2^(32-8) + std::hash hasher; + return hasher(place + type); + } + }; + + KernelType type_; + lite::fluid::Place place_; + + KernelKey(KernelType type, lite::fluid::Place place) + : type_(type), place_(place) {} + size_t hash_key() const { return Hash()(*this); } + + bool operator==(const KernelKey& o) const { + return /*platform::places_are_same_class(place_, o.place_)*/ true && + type_ == o.type_; + } + bool operator!=(const KernelKey& o) const { return !(*this == o); } +}; + +// Every JitCode should have a method to get the key from attribution +template +int64_t JitCodeKey(const Attr& attr); + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/kernel_pool.cc b/lite/backends/loongarch/jit/kernel_pool.cc new file mode 100644 index 00000000000..0dcffb6c7c8 --- /dev/null +++ b/lite/backends/loongarch/jit/kernel_pool.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "lite/backends/loongarch/jit/kernel_pool.h" +#include +#include // for shared_ptr +#include + +namespace paddle { +namespace lite { +namespace jit { + +JitCodeCreatorPool& JitCodeCreatorPool::Instance() { + static JitCodeCreatorPool g_creator_pool; + return g_creator_pool; +} + +KernelPool& KernelPool::Instance() { + static KernelPool g_kernel_pool; + return g_kernel_pool; +} + +ReferKernelPool& ReferKernelPool::Instance() { + static ReferKernelPool g_refer_kernel_pool; + return g_refer_kernel_pool; +} + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/kernel_pool.h b/lite/backends/loongarch/jit/kernel_pool.h new file mode 100644 index 00000000000..5e46324055c --- /dev/null +++ b/lite/backends/loongarch/jit/kernel_pool.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once + +#include // for unique_ptr +#include +#include +#include // for move +#include +#include "lite/backends/loongarch/jit/gen_base.h" +#include "lite/backends/loongarch/jit/kernel_base.h" +#include "lite/backends/loongarch/jit/kernel_key.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace jit { + +template +class JitCodePool { + typedef std::unique_ptr GenBasePtr; + typedef std::unordered_map JitCodeMap; + + public: + JitCodePool() = default; + static JitCodePool& Instance() { + static LITE_THREAD_LOCAL JitCodePool g_jit_codes; + return g_jit_codes; + } + + const JitCodeMap& AllKernels() { return codes_; } + + bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); } + + void Insert(int64_t key, GenBasePtr value) { + codes_.emplace(key, std::move(value)); + } + + private: + JitCodeMap codes_; +}; + +class JitCodeCreatorPool { + typedef std::unique_ptr GenCreatorPtr; + typedef std::unordered_map, + KernelKey::Hash> + GenCreatorPtrMap; + + public: + JitCodeCreatorPool() = default; + static JitCodeCreatorPool& Instance(); + GenCreatorPtrMap& AllCreators() { return creators_; } + void Insert(const KernelKey& key, GenCreatorPtr value) { + if (creators_.find(key) == creators_.end()) { + creators_.emplace(key, std::vector()); + } + creators_.at(key).emplace_back(std::move(value)); + } + + private: + GenCreatorPtrMap creators_; +}; + +typedef std::unique_ptr KernelPtr; +typedef std::unordered_map, KernelKey::Hash> + KernelMap; + +class KernelPool { + public: + static KernelPool& Instance(); + KernelPool() = default; + KernelMap& AllKernels() { return pool_; } + void Insert(const KernelKey& key, KernelPtr value) { + if (pool_.find(key) == pool_.end()) { + pool_.emplace(key, std::vector()); + } + pool_.at(key).emplace_back(std::move(value)); + } + + private: + KernelMap pool_; +}; + +// Every kernel should have refer code and it should be used in unit tests, +// so refer kernels should have it's independent kernel pool +class ReferKernelPool { + public: + static ReferKernelPool& Instance(); + ReferKernelPool() = default; + KernelMap& AllKernels() { return pool_; } + void Insert(const KernelKey& key, KernelPtr value) { + if (pool_.find(key) == pool_.end()) { + pool_.emplace(key, std::vector()); + } + pool_.at(key).emplace_back(std::move(value)); + } + + private: + KernelMap pool_; +}; + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/macro.h b/lite/backends/loongarch/jit/macro.h new file mode 100644 index 00000000000..27de0b99813 --- /dev/null +++ b/lite/backends/loongarch/jit/macro.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once +#include + +namespace paddle { +namespace lite { +namespace jit { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +#define LSX_FLOAT_BLOCK 4 +#define LASX_FLOAT_BLOCK 8 + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/more/CMakeLists.txt b/lite/backends/loongarch/jit/more/CMakeLists.txt new file mode 100644 index 00000000000..4e9dcb8fd2d --- /dev/null +++ b/lite/backends/loongarch/jit/more/CMakeLists.txt @@ -0,0 +1,10 @@ +function(USE_JITKERNEL_MORE_LITE TARGET TYPE) + file(APPEND ${jit_file} "USE_JITKERNEL_MORE_LITE(${TARGET} ${TYPE});\n") +endfunction() + +IF(WITH_LASX) + ADD_SUBDIRECTORY(intrinsic) +ENDIF() + +# mix should be last +add_subdirectory(mix) diff --git a/lite/backends/loongarch/jit/more/intrinsic/CMakeLists.txt b/lite/backends/loongarch/jit/more/intrinsic/CMakeLists.txt new file mode 100644 index 00000000000..5563e39bb87 --- /dev/null +++ b/lite/backends/loongarch/jit/more/intrinsic/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB jit_kernel_cc_intrinsic "${CMAKE_CURRENT_SOURCE_DIR}/*.cc") +set(LOONGARCH_JIT_MORE_SRC ${LOONGARCH_JIT_MORE_SRC} ${jit_kernel_cc_intrinsic} CACHE INTERNAL "") + +# use mkl kernels by name and type +USE_JITKERNEL_MORE_LITE(kCRFDecoding, intrinsic) +USE_JITKERNEL_MORE_LITE(kLayerNorm, intrinsic) diff --git a/lite/backends/loongarch/jit/more/intrinsic/crf_decoding.cc b/lite/backends/loongarch/jit/more/intrinsic/crf_decoding.cc new file mode 100644 index 00000000000..9899a2c41b3 --- /dev/null +++ b/lite/backends/loongarch/jit/more/intrinsic/crf_decoding.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "lite/backends/loongarch/jit/more/intrinsic/crf_decoding.h" +#include +#include "lite/backends/loongarch/cpu_info.h" +#include "lite/backends/loongarch/jit/registry.h" + +namespace paddle { +namespace lite { +namespace jit { +namespace more { +namespace intrinsic { + +void CRFDecoding(const int seq_len, + const float* x, + const float* w, + float* alpha, + int* track, + int tag_num) { + const int step_size = LASX_FLOAT_BLOCK; + const int end = tag_num / step_size; + const int rest = tag_num % step_size; + /* Setup the alpha initial value.*/ + int i_offset = 0; + int last_offset = rest - step_size; + for (int i = 0; i <= end; ++i) { + // weights, input and alpha values. + __m256 w_content, x_content, alpha_content; + // Load the relevant data into the variables from un-aligned address. + w_content = lasx_loadu_f32(w + i_offset); + x_content = lasx_loadu_f32(x + i_offset); + alpha_content = lasx_add_f32(w_content, x_content); + lasx_storeu_f32(alpha + i_offset, alpha_content); + i_offset += step_size; + if (i == end - 1) { + if (rest > 0) { + i_offset += last_offset; + } else { + break; + } + } + } + // Use the column-major strategy to get the location of maximum score. 
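+  // Concretely this is the same Viterbi-style recurrence as the refer kernel:
+  // for step k and current tag j,
+  //   alpha[k][j] = max_i(alpha[k-1][i] + w[i + state_trans_base_idx][j]) + x[k][j]
+  // where row 0 of w (used in the initialization above) holds the per-tag
+  // start weights and the rows from state_trans_base_idx on hold the
+  // tag-to-tag transitions; track[k][j] records the argmax i so the best path
+  // can be recovered by backtracking. The LASX code below evaluates one
+  // LASX_FLOAT_BLOCK (eight) of current tags j per iteration.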
+ int seq_offset = 0; + constexpr int state_trans_base_idx = 2; + for (int k = 1; k < seq_len; ++k) { + int j_offset = 0; + for (int j = 0; j <= end; ++j) { +/* Initialize the variables of maximum score and location.*/ + __m256 max_score = lasx_set1_f32(-std::numeric_limits::max()); + __m256i max_j = lasx_set1_i32(0); + /* Calculate the offset of transition_weights.*/ + int trans_offset = state_trans_base_idx * tag_num + j_offset; + for (int i = 0; i < tag_num; ++i) { +/* Initalize the content of alpha variable with related offset.*/ + __m256 alpha_content = lasx_broadcast_1f32(alpha + seq_offset + i); + /* Obtain the content of weights from un-aligned address.*/ + __m256 w_content = lasx_loadu_f32(w + trans_offset); + __m256 score_v = lasx_add_f32(alpha_content, w_content); + __m256i mask = + lasx_castf32_m256i(lasx_xvfcmp_slt_s(max_score, score_v)); +/* According to the mask value, update the index of the max_score.*/ + max_j = lasx_or_m256i(lasx_andnot_m256i(mask, max_j), + lasx_and_m256i(mask, lasx_set1_i32(i))); + /* Update the max_score value.*/ + max_score = lasx_max_f32(max_score, score_v); + + trans_offset += tag_num; + } +/* Update the alpha and track values. */ + __m256 x_content = lasx_loadu_f32(x + seq_offset + tag_num + j_offset); + max_score = lasx_add_f32(max_score, x_content); + lasx_storeu_f32(alpha + seq_offset + tag_num + j_offset, max_score); + lasx_storeu_m256i( + reinterpret_cast<__m256i*>(track + seq_offset + tag_num + j_offset), + max_j); + + /* Calculate the offset of next step*/ + j_offset += step_size; + if (j == end - 1) { + if (rest > 0) { + j_offset += last_offset; + } else { + break; + } + } + } + seq_offset += tag_num; + } +} + +bool CRFDecodingKernel::CanBeUsed(const int& d) const { + constexpr int block = LASX_FLOAT_BLOCK; + return loongarch::MayIUse(loongarch::lasx) && d >= block; +} + +} // namespace intrinsic +} // namespace more +} // namespace jit +} // namespace lite +} // namespace paddle + +namespace intrinsic = paddle::lite::jit::more::intrinsic; + +REGISTER_JITKERNEL_MORE(kCRFDecoding, intrinsic, intrinsic::CRFDecodingKernel); diff --git a/lite/backends/loongarch/jit/more/intrinsic/crf_decoding.h b/lite/backends/loongarch/jit/more/intrinsic/crf_decoding.h new file mode 100644 index 00000000000..f8ccb27042f --- /dev/null +++ b/lite/backends/loongarch/jit/more/intrinsic/crf_decoding.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once + +#include +#include "lite/backends/loongarch/jit/kernel_base.h" + +namespace paddle { +namespace lite { +namespace jit { +namespace more { +namespace intrinsic { + +void CRFDecoding(const int seq_len, + const float* x, + const float* w, + float* alpha, + int* track, + int tag_num); + +class CRFDecodingKernel : public KernelMore> { + public: + CRFDecodingKernel() { this->func = CRFDecoding; } + bool CanBeUsed( + const typename CRFDecodingTuple::attr_type&) const override; + const char* ImplType() const override { return "Intrinsic"; } +}; + +} // namespace intrinsic +} // namespace more +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/more/intrinsic/layer_norm.cc b/lite/backends/loongarch/jit/more/intrinsic/layer_norm.cc new file mode 100644 index 00000000000..ecb2f87958e --- /dev/null +++ b/lite/backends/loongarch/jit/more/intrinsic/layer_norm.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "lite/backends/loongarch/jit/more/intrinsic/layer_norm.h" +#include +#include "lite/backends/loongarch/jit/registry.h" + +namespace paddle { +namespace lite { +namespace jit { +namespace more { +namespace intrinsic { + +void LayerNorm(float* x, + float* out, + float* mean, + float* var, + const float* scale, + const float* bias, + int height, + const float epsilon, + int right) { + __m256 sum; + __m256 mean_vec, var_vec; + __m128 hi, lo; + __m256 tmp; + size_t offset; + size_t j; + int block = LASX_FLOAT_BLOCK; + const int rest = right % block; + const int end = right - rest; + + __m256 reverse_num_vec = + lasx_div_f32(lasx_set1_f32(1.0), lasx_set1_f32(right)); + __m256 epsilon_vec = lasx_set1_f32(epsilon); + int rest_mask = + ((-1) & (~((~0U) >> (sizeof(int) * 8 - (block - rest))))) & 0x0ff; + __m256i mask_vec = lasx_set_i32(rest_mask & 0x80 ? 0xffffffff : 0, + rest_mask & 0x40 ? 0xffffffff : 0, + rest_mask & 0x20 ? 0xffffffff : 0, + rest_mask & 0x10 ? 0xffffffff : 0, + rest_mask & 0x8 ? 0xffffffff : 0, + rest_mask & 0x4 ? 0xffffffff : 0, + rest_mask & 0x2 ? 0xffffffff : 0, + rest_mask & 0x1 ? 
0xffffffff : 0); + + for (int i = 0; i < height; ++i) { + offset = i * right; + + /* get mean */ + sum = lasx_setzero_f32(); + for (j = offset; j < end + offset; j += block) { + sum = lasx_add_f32(sum, lasx_loadu_f32((const float*)x + j)); + } + if (rest != 0) { + j = offset + right - block; + tmp = lasx_loadu_f32((const float*)x + j); + tmp = lasx_blendv_f32(lasx_setzero_f32(), + tmp, + *(__m256*)&mask_vec); // NOLINT + sum = lasx_add_f32(sum, tmp); + } + hi = lasx_extractf128_f32(sum, 1); + lo = lasx_extractf128_f32(sum, 0); + sum = lasx_add_f32( + sum, + lasx_insertf128_f32( + lasx_insertf128_f32(lasx_setzero_f32(), hi, 0), lo, 1)); + sum = lasx_hadd_f32(sum, sum); + sum = lasx_hadd_f32(sum, sum); + mean_vec = lasx_mul_f32(sum, reverse_num_vec); + mean[i] = *reinterpret_cast(&mean_vec); + + /* get variance */ + sum = lasx_setzero_f32(); + for (j = offset; j < end + offset; j += block) { + tmp = lasx_sub_f32(lasx_loadu_f32((const float*)x + j), mean_vec); + tmp = lasx_mul_f32(tmp, tmp); + sum = lasx_add_f32(sum, tmp); + } + if (rest != 0) { + j = offset + right - block; + tmp = lasx_sub_f32(lasx_loadu_f32((const float*)x + j), mean_vec); + tmp = lasx_mul_f32(tmp, tmp); + tmp = lasx_blendv_f32(lasx_setzero_f32(), + tmp, + *(__m256*)&mask_vec); // NOLINT + sum = lasx_add_f32(sum, tmp); + } + hi = lasx_extractf128_f32(sum, 1); + lo = lasx_extractf128_f32(sum, 0); + sum = lasx_add_f32( + sum, + lasx_insertf128_f32( + lasx_insertf128_f32(lasx_setzero_f32(), hi, 0), lo, 1)); + sum = lasx_hadd_f32(sum, sum); + sum = lasx_hadd_f32(sum, sum); + var_vec = lasx_mul_f32(sum, reverse_num_vec); + var[i] = *reinterpret_cast(&var_vec); + + /* get x_norm and calculate output*/ + for (j = offset; j < end + offset; j += block) { + tmp = lasx_sub_f32(lasx_loadu_f32((const float*)x + j), mean_vec); + tmp = lasx_div_f32(tmp, + lasx_sqrt_f32(lasx_add_f32(var_vec, epsilon_vec))); + lasx_storeu_f32(reinterpret_cast(out) + j, tmp); + } + if (rest != 0) { + j = offset + right - block; + tmp = lasx_sub_f32(lasx_loadu_f32((const float*)x + j), mean_vec); + tmp = lasx_div_f32(tmp, + lasx_sqrt_f32(lasx_add_f32(var_vec, epsilon_vec))); + lasx_storeu_f32(reinterpret_cast(out) + j, tmp); + } + + if (scale) { + if (rest != 0) { + j = offset + right - block; + tmp = lasx_loadu_f32((const float*)out + j); + } + for (j = offset; j < end + offset; j += block) { + lasx_storeu_f32( + reinterpret_cast(out) + j, + lasx_mul_f32(lasx_loadu_f32((const float*)out + j), + lasx_loadu_f32((const float*)scale + j - offset))); + } + if (rest != 0) { + j = offset + right - block; + lasx_storeu_f32( + reinterpret_cast(out) + j, + lasx_mul_f32(tmp, + lasx_loadu_f32((const float*)scale + j - offset))); + } + } + + if (bias) { + if (rest != 0) { + j = offset + right - block; + tmp = lasx_loadu_f32((const float*)out + j); + } + for (j = offset; j < end + offset; j += block) { + lasx_storeu_f32( + reinterpret_cast(out) + j, + lasx_add_f32(lasx_loadu_f32((const float*)out + j), + lasx_loadu_f32((const float*)bias + j - offset))); + } + if (rest != 0) { + j = offset + right - block; + lasx_storeu_f32( + reinterpret_cast(out) + j, + lasx_add_f32(tmp, + lasx_loadu_f32((const float*)bias + j - offset))); + } + } + } +} + +bool LayerNormKernel::CanBeUsed(const int& d) const { + return loongarch::MayIUse(loongarch::lasx) && d >= LASX_FLOAT_BLOCK; +} + +} // namespace intrinsic +} // namespace more +} // namespace jit +} // namespace lite +} // namespace paddle + +namespace intrinsic = paddle::lite::jit::more::intrinsic; + 
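+// REGISTER_JITKERNEL_MORE (see jit/registry.h in this patch) registers the
+// kernel on CPUPlace: a JitKernelRegistrar inserts it into KernelPool under
+// the (kLayerNorm, CPUPlace) key, and the refer kernel's touch function is
+// referenced so the reference implementation is always linked in as a
+// fallback.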
+REGISTER_JITKERNEL_MORE(kLayerNorm, intrinsic, intrinsic::LayerNormKernel); diff --git a/lite/backends/loongarch/jit/more/intrinsic/layer_norm.h b/lite/backends/loongarch/jit/more/intrinsic/layer_norm.h new file mode 100644 index 00000000000..a422df058a7 --- /dev/null +++ b/lite/backends/loongarch/jit/more/intrinsic/layer_norm.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "lite/backends/loongarch/jit/kernel_base.h" + +namespace paddle { +namespace lite { +namespace jit { +namespace more { +namespace intrinsic { + +void LayerNorm(float* x, + float* out, + float* mean, + float* var, + const float* scale, + const float* bias, + int height, + const float epsilon, + int right); + +class LayerNormKernel : public KernelMore> { + public: + LayerNormKernel() { this->func = LayerNorm; } + bool CanBeUsed( + const typename LayerNormTuple::attr_type&) const override; + const char* ImplType() const override { return "Intrinsic"; } +}; + +} // namespace intrinsic +} // namespace more +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/more/mix/CMakeLists.txt b/lite/backends/loongarch/jit/more/mix/CMakeLists.txt new file mode 100644 index 00000000000..f2e94a159a7 --- /dev/null +++ b/lite/backends/loongarch/jit/more/mix/CMakeLists.txt @@ -0,0 +1,11 @@ +file(GLOB jit_kernel_mix_cc "${CMAKE_CURRENT_SOURCE_DIR}/*.cc") +set(LOONGARCH_JIT_MORE_SRC ${LOONGARCH_JIT_MORE_SRC} ${jit_kernel_mix_cc} CACHE INTERNAL "") + +USE_JITKERNEL_MORE_LITE(kVSigmoid, mix) +USE_JITKERNEL_MORE_LITE(kVTanh, mix) +USE_JITKERNEL_MORE_LITE(kLSTMCtHt, mix) +USE_JITKERNEL_MORE_LITE(kLSTMC1H1, mix) +USE_JITKERNEL_MORE_LITE(kGRUH1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart2, mix) +USE_JITKERNEL_MORE_LITE(kSoftmax, mix) diff --git a/lite/backends/loongarch/jit/more/mix/mix.cc b/lite/backends/loongarch/jit/more/mix/mix.cc new file mode 100644 index 00000000000..443a62b5e1e --- /dev/null +++ b/lite/backends/loongarch/jit/more/mix/mix.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "lite/backends/loongarch/jit/more/mix/mix.h" +#include "lite/backends/loongarch/jit/kernels.h" +#include "lite/backends/loongarch/jit/registry.h" + +namespace paddle { +namespace lite { +namespace jit { +namespace more { +namespace mix { + +using CPUPlace = lite::fluid::CPUPlace; + +void VSigmoid(const T* x, T* y, int n) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + auto compute = KernelFuncs, CPUPlace>::Cache().At(n); + compute(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +void VTanh(const T* x, T* y, int n) { + const T a = 2, b = -1; + auto compute_scal = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_addbias = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_sigmoid = KernelFuncs, CPUPlace>::Cache().At(n); + compute_scal(&a, x, y, n); + compute_sigmoid(y, y, n); + compute_scal(&a, y, y, n); + compute_addbias(&b, y, y, n); +} + +// remain is the product of dimension shapes after the axis dimension +void Softmax(const T* x, T* y, int n, int bs, int remain) { + auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_strideasum = + KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_stridescal = + KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vaddbias = + KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); + + for (int i = 0; i < bs; ++i) { + T scalar; + compute_hmax(x, &scalar, n); + scalar = static_cast(0) - scalar; + compute_vaddbias(&scalar, x, y, n); // x - max + compute_vexp(y, y, n); + if (remain == 1) { + compute_hsum(y, &scalar, n); + scalar = static_cast(1) / scalar; + compute_vscal(&scalar, y, y, n); + } else { + for (int j = 0; j < remain; ++j) { + compute_strideasum(&y[j], &scalar, n, remain); + scalar = static_cast(1) / scalar; + compute_stridescal(&scalar, &y[j], &y[j], n - j, remain); + } + } + x += n; + y += n; + } +} + +void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT + if (type == kVSigmoid) { + return KernelFuncs, CPUPlace>::Cache().At(d); + } else if (type == kVRelu) { + return KernelFuncs, CPUPlace>::Cache().At(d); + } else if (type == kVTanh) { + return KernelFuncs, CPUPlace>::Cache().At(d); + } else if (type == kVIdentity) { + return KernelFuncs, CPUPlace>::Cache().At(d); + } + LOG(FATAL) << "Not support type: " << type; + return nullptr; +} + +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + const T* wp = reinterpret_cast(step->wp); + T* checked = reinterpret_cast(step->checked); + const int d = attr->d; + const int d2 = d * 2; + const int d3 = d * 3; + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d2 = KernelFuncs, CPUPlace>::Cache().At(d2); + auto act_gate_d = getActFunc(attr->act_gate, d); + auto act_gate_d2 = getActFunc(attr->act_gate, d2); + auto act_gate_d3 = getActFunc(attr->act_gate, d3); + auto act_cand_d = getActFunc(attr->act_cand, d); + auto act_cell_d = getActFunc(attr->act_cell, d); + + if (attr->use_peephole) { + vmul_d(wp, ct_1, 
checked, d); + vmul_d(wp + d, ct_1, checked + d, d); + vadd_d2(checked, gates + d, gates + d, d2); + act_gate_d2(gates + d, gates + d, d2); + } else { + act_gate_d3(gates + d, gates + d, d3); + } + + // C_t = C_t-1 * fgated + cand_gated * igated + act_cand_d(gates, gates, d); + vmul_d(gates, gates + d, gates + d, d); + vmul_d(ct_1, gates + d2, gates + d2, d); + vadd_d(gates + d, gates + d2, ct, d); + + if (attr->use_peephole) { + // get ogated + vmul_d(wp + d2, ct, gates + d, d); + vadd_d(gates + d, gates + d3, gates + d3, d); + act_gate_d(gates + d3, gates + d3, d); + } + // H_t = act_cell(C_t) * ogated + act_cell_d(ct, gates + d2, d); + vmul_d(gates + d2, gates + d3, ht, d); +} + +void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto act_gate_d = getActFunc(attr->act_gate, d); + auto act_cand_d = getActFunc(attr->act_cand, d); + auto act_cell_d = getActFunc(attr->act_cell, d); + /* C_t = igated * cgated*/ + act_gate_d(gates + d, gates + d, d); + act_cand_d(gates, gates, d); + vmul_d(gates, gates + d, ct, d); + if (attr->use_peephole) { + // get outgated, put W_oc * C_t on igated + const T* wp = reinterpret_cast(step->wp); + vmul_d(wp + d2, ct, gates + d, d); + vadd_d(gates + d, gates + d3, gates + d3, d); + } + /* H_t = act_cell(C_t) * ogated */ + act_gate_d(gates + d3, gates + d3, d); + act_cell_d(ct, gates + d2, d); + vmul_d(gates + d2, gates + d3, ht, d); +} + +// compute h1 without h0 +void GRUH1(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + int d = attr->d; + int d2 = d * 2; + auto act_gate = getActFunc(attr->act_gate, d); + auto act_cand = getActFunc(attr->act_cand, d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + act_gate(gates, gates, d); + act_cand(gates + d2, gates + d2, d); + vmul_d(gates, gates + d2, ht, d); +} + +// compute the first part of GRU: ht = act_gate(r) * ht_1 +void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { + // W: {W_update, W_reset; W_state} + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate, attr->d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(attr->d); + act_gate(gates + attr->d, gates + attr->d, attr->d); + vmul_d(ht_1, gates + attr->d, ht, attr->d); +} + +// compute the second part of GRU: +// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 +void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + int d = attr->d; + auto act_gate = getActFunc(attr->act_gate, d); + auto act_cand = getActFunc(attr->act_cand, d); + T* y = gates + d * 2; + act_gate(gates, gates, d); + act_cand(y, y, d); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } +} + +// TODO(TJ): tuning me +bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; } + +bool VTanhKernel::CanBeUsed(const int& d) const { return true; } + +bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; } + +bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { 
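+  // Mix kernels emit no LASX code themselves; they compose other registered
+  // kernels at run time through the KernelFuncs cache, e.g. as in LSTMCtHt
+  // above:
+  //   auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
+  //   vmul_d(wp, ct_1, checked, d);
+  // so these CanBeUsed hooks place no ISA or size restriction and simply
+  // return true; the TODO above leaves finer per-size tuning for later.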
return true; } + +bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } + +bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } + +bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } + +bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } + +} // namespace mix +} // namespace more +} // namespace jit +} // namespace lite +} // namespace paddle + +namespace mix = paddle::lite::jit::more::mix; + +#define REGISTER_MORE_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel) + +REGISTER_MORE_KERNEL(VSigmoid); +REGISTER_MORE_KERNEL(VTanh); +REGISTER_MORE_KERNEL(Softmax); +REGISTER_MORE_KERNEL(LSTMCtHt); +REGISTER_MORE_KERNEL(LSTMC1H1); +REGISTER_MORE_KERNEL(GRUH1); +REGISTER_MORE_KERNEL(GRUHtPart1); +REGISTER_MORE_KERNEL(GRUHtPart2); + +#undef REGISTER_MORE_KERNEL diff --git a/lite/backends/loongarch/jit/more/mix/mix.h b/lite/backends/loongarch/jit/more/mix/mix.h new file mode 100644 index 00000000000..b059fae7009 --- /dev/null +++ b/lite/backends/loongarch/jit/more/mix/mix.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "lite/backends/loongarch/jit/kernel_base.h" + +namespace paddle { +namespace lite { +namespace jit { +namespace more { +namespace mix { +using T = float; + +void VSigmoid(const T* x, T* y, int n); +void VTanh(const T* x, T* y, int n); +void Softmax(const T* x, T* y, int n, int bs, int remain); + +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); +void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); +void GRUH1(gru_t* step, const gru_attr_t* attr); +void GRUHtPart1(gru_t* step, const gru_attr_t* attr); +void GRUHtPart2(gru_t* step, const gru_attr_t* attr); + +#define DECLARE_MORE_KERNEL(name) \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "Mixed"; } \ + } + +// XYN +DECLARE_MORE_KERNEL(VSigmoid); +DECLARE_MORE_KERNEL(VTanh); + +// XRN +DECLARE_MORE_KERNEL(Softmax); + +DECLARE_MORE_KERNEL(LSTMCtHt); +DECLARE_MORE_KERNEL(LSTMC1H1); + +DECLARE_MORE_KERNEL(GRUH1); +DECLARE_MORE_KERNEL(GRUHtPart1); +DECLARE_MORE_KERNEL(GRUHtPart2); + +#undef DECLARE_MORE_KERNEL + +} // namespace mix +} // namespace more +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/refer/CMakeLists.txt b/lite/backends/loongarch/jit/refer/CMakeLists.txt new file mode 100644 index 00000000000..6e25242a897 --- /dev/null +++ b/lite/backends/loongarch/jit/refer/CMakeLists.txt @@ -0,0 +1,36 @@ +function(USE_JITKERNEL_REFER_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_REFER_LITE(${TARGET});\n") +endfunction() + +# use refer kernel by name +USE_JITKERNEL_REFER_LITE(kVMul) +USE_JITKERNEL_REFER_LITE(kVAdd) 
+USE_JITKERNEL_REFER_LITE(kVAddRelu) +USE_JITKERNEL_REFER_LITE(kVSub) +USE_JITKERNEL_REFER_LITE(kVScal) +USE_JITKERNEL_REFER_LITE(kStrideScal) +USE_JITKERNEL_REFER_LITE(kVAddBias) +USE_JITKERNEL_REFER_LITE(kVCopy) +USE_JITKERNEL_REFER_LITE(kVRelu) +USE_JITKERNEL_REFER_LITE(kVIdentity) +USE_JITKERNEL_REFER_LITE(kVExp) +USE_JITKERNEL_REFER_LITE(kVSigmoid) +USE_JITKERNEL_REFER_LITE(kVTanh) +USE_JITKERNEL_REFER_LITE(kLSTMCtHt) +USE_JITKERNEL_REFER_LITE(kLSTMC1H1) +USE_JITKERNEL_REFER_LITE(kGRUH1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart2) +USE_JITKERNEL_REFER_LITE(kCRFDecoding) +USE_JITKERNEL_REFER_LITE(kLayerNorm) +USE_JITKERNEL_REFER_LITE(kNCHW16CMulNC) +USE_JITKERNEL_REFER_LITE(kSeqPool) +USE_JITKERNEL_REFER_LITE(kMatMul) +USE_JITKERNEL_REFER_LITE(kVSquare) +USE_JITKERNEL_REFER_LITE(kHSum) +USE_JITKERNEL_REFER_LITE(kHMax) +USE_JITKERNEL_REFER_LITE(kStrideASum) +USE_JITKERNEL_REFER_LITE(kSoftmax) +USE_JITKERNEL_REFER_LITE(kEmbSeqPool) +USE_JITKERNEL_REFER_LITE(kSgd) +USE_JITKERNEL_REFER_LITE(kVBroadcast) diff --git a/lite/backends/loongarch/jit/refer/refer.cc b/lite/backends/loongarch/jit/refer/refer.cc new file mode 100644 index 00000000000..cf17af188ca --- /dev/null +++ b/lite/backends/loongarch/jit/refer/refer.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "lite/backends/loongarch/jit/refer/refer.h" +#include "lite/backends/loongarch/jit/registry.h" + +namespace refer = paddle::lite::jit::refer; + +#define REGISTER_REFER_KERNEL(func) \ + REGISTER_JITKERNEL_REFER_LITE( \ + k##func, refer::func##Kernel, refer::func##Kernel) + +REGISTER_REFER_KERNEL(VMul); +REGISTER_REFER_KERNEL(VAdd); +REGISTER_REFER_KERNEL(VAddRelu); +REGISTER_REFER_KERNEL(VSub); + +REGISTER_REFER_KERNEL(VScal); +REGISTER_REFER_KERNEL(StrideScal); +REGISTER_REFER_KERNEL(VAddBias); + +REGISTER_REFER_KERNEL(VRelu); +REGISTER_REFER_KERNEL(VCopy); +REGISTER_REFER_KERNEL(VIdentity); +REGISTER_REFER_KERNEL(VSquare); +REGISTER_REFER_KERNEL(VExp); +REGISTER_REFER_KERNEL(VSigmoid); +REGISTER_REFER_KERNEL(VTanh); + +REGISTER_REFER_KERNEL(LSTMCtHt); +REGISTER_REFER_KERNEL(LSTMC1H1); + +REGISTER_REFER_KERNEL(GRUH1); +REGISTER_REFER_KERNEL(GRUHtPart1); +REGISTER_REFER_KERNEL(GRUHtPart2); + +REGISTER_REFER_KERNEL(CRFDecoding); +REGISTER_REFER_KERNEL(LayerNorm); +REGISTER_REFER_KERNEL(NCHW16CMulNC); +REGISTER_REFER_KERNEL(SeqPool); +REGISTER_REFER_KERNEL(MatMul); +REGISTER_REFER_KERNEL(HMax); +REGISTER_REFER_KERNEL(HSum); +REGISTER_REFER_KERNEL(StrideASum); +REGISTER_REFER_KERNEL(Softmax); +REGISTER_REFER_KERNEL(EmbSeqPool); +REGISTER_REFER_KERNEL(Sgd); +REGISTER_REFER_KERNEL(VBroadcast); + +#undef REGISTER_REFER_KERNEL diff --git a/lite/backends/loongarch/jit/refer/refer.h b/lite/backends/loongarch/jit/refer/refer.h new file mode 100644 index 00000000000..69d5baa3be5 --- /dev/null +++ b/lite/backends/loongarch/jit/refer/refer.h @@ -0,0 +1,602 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "lite/backends/loongarch/jit/helper.h" +#include "lite/backends/loongarch/jit/kernel_base.h" +#include "lite/backends/loongarch/jit/macro.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace jit { +namespace refer { + +// Refer code only focus on correctness +template +void VMul(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +template +void VAdd(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +template +void VAddRelu(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? z[i] : 0; + } +} + +template +void VSub(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] - y[i]; + } +} + +template +void VScal(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] * x[i]; + } +} + +template +void VAddBias(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] + x[i]; + } +} + +template +void VCopy(const T* x, T* y, int n) { + std::memcpy(y, x, n * sizeof(T)); +} + +// x shape: (x_len) +// y shape: (h, x_len) +template +void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { + for (int64_t h = 0; h < y_h; ++h) { + VCopy(x, y + h * x_len, x_len); + } +} + +template +void VRelu(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template +inline void VIdentity(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i]; + } +} + +template +inline void VSquare(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] * x[i]; + } +} + +template +void VExp(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +void VSigmoid(const T* x, T* y, int n) { + // y = 1 / (1 + e^-x) + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); + y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); + } +} + +template +void VTanh(const T* x, T* y, int n) { + // y = 2 * sigmoid(2x) - 1 + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + +template +void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT + if (type == kVSigmoid) { + return VSigmoid; + } else if (type == kVRelu) { + return VRelu; + } else if (type == kVTanh) { + return VTanh; + } else if (type == kVIdentity) { + return VIdentity; + } + LOG(FATAL) << "Not support type: " << type; + return nullptr; +} + +// TODO(TJ): add refer gemm and make LSTM kernels combine as same GRU kernels + +// compute ct and ht +template +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + const T* wp = reinterpret_cast(step->wp); + T* checked = reinterpret_cast(step->checked); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + // gates: W_ch, W_ih, W_fh, W_oh + if (attr->use_peephole) { + VMul(wp, ct_1, checked, d); + VMul(wp + d, ct_1, checked + d, d); + VAdd(checked, gates + d, gates + d, d2); + act_gate(gates + d, gates + d, d2); + } else { + act_gate(gates + d, gates + d, d3); + } + + // C_t = C_t-1 * fgated + cand_gated * igated + act_cand(gates, gates, d); + VMul(gates, gates + d, gates + d, d); + VMul(ct_1, gates + d2, gates + d2, d); + VAdd(gates + d, gates + d2, ct, d); + + if (attr->use_peephole) { + // get ogated + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + act_gate(gates + d3, gates + d3, d); + } + // H_t = act_cell(C_t) * ogated + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +// compute c1 and h1 without c0 or h0 +template +void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + /* C_t = igated * cgated*/ + act_gate(gates + d, gates + d, d); + act_cand(gates, gates, d); + VMul(gates, gates + d, ct, d); + if (attr->use_peephole) { + // get outgated, put W_oc * C_t on igated + const T* wp = reinterpret_cast(step->wp); + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + } + /* H_t = act_cell(C_t) * ogated */ + act_gate(gates + d3, gates + d3, d); + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +// compute h1 without h0 +template +void GRUH1(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + int d2 = d * 2; + act_gate(gates, gates, d); + act_cand(gates + d2, gates + d2, d); + VMul(gates, gates + d2, ht, d); +} + +// compute the first part of GRU: ht = act_gate(r) * ht_1 +template +void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { + // W: {W_update, W_reset; W_state} + T* gates = 
reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + act_gate(gates + attr->d, gates + attr->d, attr->d); + VMul(ht_1, gates + attr->d, ht, attr->d); +} + +// compute the second part of GRU: +// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 +template +void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + T* y = gates + d * 2; + act_gate(gates, gates, d); + act_cand(y, y, d); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } +} + +template +void CRFDecoding(const int seq_len, + const T* x, + const T* w, + T* alpha, + int* track, + int right) { + constexpr int state_trans_base_idx = 2; + for (int i = 0; i < right; ++i) { + alpha[i] = w[i] + x[i]; + } + for (int k = 1; k < seq_len; ++k) { + for (int i = 0; i < right; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (int j = 0; j < right; ++j) { + T score = alpha[(k - 1) * right + j] + + w[(j + state_trans_base_idx) * right + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + alpha[k * right + i] = max_score + x[k * right + i]; + track[k * right + i] = max_j; + } + } +} + +template +void LayerNorm(T* x, + T* out, + T* mean, + T* var, + const T* scale, + const T* bias, + int height, + const float epsilon, + int right) { + // get mean + for (int i = 0; i < height; i++) { + T sum = 0.0; + int offset = i * right; + for (int j = 0; j < right; j++) { + sum += x[offset + j]; + } + mean[i] = sum / right; + } + + // get variance + for (int i = 0; i < height; i++) { + T sum = 0.0; + int offset = i * right; + for (int j = 0; j < right; j++) { + sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]); + } + var[i] = sum / right; + } + + for (int i = 0; i < height; i++) { + int offset = i * right; + T sqrt_var = std::sqrt(var[i] + (T)epsilon); + for (int j = 0; j < right; j++) { + out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var; + } + } + if (scale) { + for (int i = 0; i < height; i++) { + int offset = i * right; + for (int j = 0; j < right; j++) { + out[offset + j] *= scale[j]; + } + } + } + + if (bias) { + for (int i = 0; i < height; i++) { + int offset = i * right; + for (int j = 0; j < right; j++) { + out[offset + j] += bias[j]; + } + } + } +} + +template +void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { + int offset = 0; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int i = 0; i < 16; ++i) { + z[i + offset] = y[i] * x[i + offset]; + } + offset += 16; + } + } +} + +template +void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { + for (int w = 0; w < attr->w; ++w) { + const T* src = x + w; + T* dst = y + w; + *dst = static_cast(0); + for (int h = 0; h < attr->h; ++h) { + *dst = *dst + *src; + src += attr->w; + } + } + if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { + T scalar = static_cast(1); + if (attr->type == SeqPoolType::kAvg) { + scalar = scalar / static_cast(attr->h); + } else { + scalar = scalar / std::sqrt(static_cast(attr->h)); + } + VScal(&scalar, y, y, attr->w); + } +} + +// A(M,K) * B(K,N) = C(M,N) +template +void MatMul(const T* A, const T* B, 
T* C, const matmul_attr_t* attr) { + int M = attr->m; + int N = attr->n; + int K = attr->k; + for (int m = 0; m < M; ++m) { + const T* pa = A + m * K; + T* pc = C + m * N; + for (int n = 0; n < N; ++n) { + const T* pb = B + n; + pc[n] = pa[0] * pb[0]; + for (int k = 1; k < K; ++k) { + pc[n] += pa[k] * pb[k * N]; + } + } + } +} + +template +void HMax(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] = res[0] < x[i] ? x[i] : res[0]; + } +} + +template +void HSum(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] += x[i]; + } +} + +template +void StrideASum(const T* x, T* res, int n, int stride) { + res[0] = x[0]; + for (int i = stride; i < n; i += stride) { + res[0] += std::abs(x[i]); + } +} + +template +void StrideScal(const T* a, const T* x, T* y, int n, int stride) { + for (int i = 0; i < n; ++i) { + if (i % stride == 0) { + y[i] = x[i] * a[0]; + } else { + y[i] = x[i]; + } + } +} + +// y = e^(x - max(x)) +// y = y / sum(y) +// remain is the product of dimension shapes after the axis dimension +template +void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { + for (int i = 0; i < bs; ++i) { + T scalar; + HMax(x, &scalar, n); + scalar = static_cast(0) - scalar; + VAddBias(&scalar, x, y, n); // x - max + VExp(y, y, n); + if (remain == 1) { + HSum(y, &scalar, n); + scalar = static_cast(1) / scalar; + VScal(&scalar, y, y, n); + } else { + for (int j = 0; j < remain; j++) { + StrideASum(&y[j], &scalar, n, remain); + scalar = static_cast(1) / scalar; + StrideScal(&scalar, &y[j], &y[j], n - j, remain); + } + } + x += n; + y += n; + } +} + +// embedding seq pool +// table is a matrix with (tbl_h, tbl_w) +// idx is a matrix with (idx_h, idx_w) +// output is a vector with length tbl_w * idx_w +template +void EmbSeqPool(const T* table, + const int64_t* idx, + T* out, + const emb_seq_pool_attr_t* attr) { + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); + + auto check_idx_value_valid = [&](int64_t i) { + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; + }; + + for (int64_t w = 0; w != attr->index_width; ++w) { + check_idx_value_valid(w); + std::memcpy(out + w * attr->table_width, + table + idx[w] * attr->table_width, + attr->table_width * sizeof(T)); + } + + for (int64_t h = 1; h < attr->index_height; ++h) { + for (int64_t w = 0; w < attr->index_width; ++w) { + int64_t i = h * attr->index_width + w; + check_idx_value_valid(i); + VAdd(table + idx[i] * attr->table_width, + out + w * attr->table_width, + out + w * attr->table_width, + attr->table_width); + } + } +} + +// SGD algorithm: +// lr is pointor of learning rate scalar +// param is an input matrix with (param_h, param_w) +// grad is an input matrix with (grad_h, grad_w), here grad_w == param_w +// selected_rows is a vectot with size selected_rows_size( <= grad_h ) +// out is an output matrix with (param_h, param_w) +// +// support both regular and sparse grad +// regular SGD: out[:] = param[:] - lr[0] * grad[:]; +// sparse SGD: out[rows[i]][:] = param[rows[i]][:] - lr[0] * grad[i][:] +// +// Note: when use sparse SGD, and if out != param, +// the out rows which are not selected have not beed changed, which maybe empty +template +void Sgd(const T* lr, + const T* param, + const T* grad, + const int64_t* rows, + T* out, + const lite::jit::sgd_attr_t* attr) { + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, 
attr->grad_height); + for (int64_t i = 0; i < attr->selected_rows_size; ++i) { + auto h_idx = rows[i]; + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); + for (int64_t j = 0; j < attr->grad_width; ++j) { + out[h_idx * attr->grad_width + j] = + param[h_idx * attr->grad_width + j] - + lr[0] * grad[i * attr->grad_width + j]; + } + } +} + +#define DECLARE_REFER_KERNEL(name) \ + template \ + class name##Kernel : public lite::jit::ReferKernel> { \ + public: \ + name##Kernel() { this->func = name; } \ + } + +// const T* x, const T* y, T* z, int n +DECLARE_REFER_KERNEL(VMul); +DECLARE_REFER_KERNEL(VAdd); +DECLARE_REFER_KERNEL(VAddRelu); +DECLARE_REFER_KERNEL(VSub); + +// const T* a, const T* x, T* y, int n +DECLARE_REFER_KERNEL(VScal); +DECLARE_REFER_KERNEL(VAddBias); + +// const T* a, const T* x, T* y, int n, int stride +DECLARE_REFER_KERNEL(StrideScal); + +// const T* x, T* y, int n +DECLARE_REFER_KERNEL(VRelu); +DECLARE_REFER_KERNEL(VIdentity); +DECLARE_REFER_KERNEL(VExp); +DECLARE_REFER_KERNEL(VSigmoid); +DECLARE_REFER_KERNEL(VTanh); +DECLARE_REFER_KERNEL(VSquare); +DECLARE_REFER_KERNEL(VCopy); + +// lstm_t*, const lstm_attr_t* +DECLARE_REFER_KERNEL(LSTMCtHt); +DECLARE_REFER_KERNEL(LSTMC1H1); + +// gru_t*, const gru_attr_t* +DECLARE_REFER_KERNEL(GRUH1); +DECLARE_REFER_KERNEL(GRUHtPart1); +DECLARE_REFER_KERNEL(GRUHtPart2); + +DECLARE_REFER_KERNEL(HMax); +DECLARE_REFER_KERNEL(HSum); + +DECLARE_REFER_KERNEL(StrideASum); + +// others +DECLARE_REFER_KERNEL(CRFDecoding); +DECLARE_REFER_KERNEL(LayerNorm); +DECLARE_REFER_KERNEL(NCHW16CMulNC); +DECLARE_REFER_KERNEL(SeqPool); +DECLARE_REFER_KERNEL(MatMul); +DECLARE_REFER_KERNEL(Softmax); +DECLARE_REFER_KERNEL(EmbSeqPool); +DECLARE_REFER_KERNEL(Sgd); +DECLARE_REFER_KERNEL(VBroadcast); + +#undef DECLARE_REFER_KERNEL + +} // namespace refer +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/jit/registry.h b/lite/backends/loongarch/jit/registry.h new file mode 100644 index 00000000000..beb704c3476 --- /dev/null +++ b/lite/backends/loongarch/jit/registry.h @@ -0,0 +1,178 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // for std::move +#include "lite/backends/loongarch/cpu_info.h" +#include "lite/backends/loongarch/jit/kernel_base.h" +#include "lite/backends/loongarch/jit/kernel_pool.h" +#include "lite/backends/loongarch/legacy_place.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace jit { + +// make_unique is supported since c++14 +template +inline std::unique_ptr make_unique(Args&&... 
args) { + static_assert(!std::is_array::value, "T must not be array"); + return std::unique_ptr(new T(std::forward(args)...)); +} + +template +struct JitKernelRegistrarFunctor; + +template +struct JitKernelRegistrarFunctor { + void operator()(KernelType kt) const {} +}; + +template +struct JitKernelRegistrarFunctor { + using KERNEL_IMPL_TYPE = + typename std::tuple_element>::type; + + void operator()(KernelType kt) const { + KernelKey kkey(kt, PlaceType()); + Pool::Instance().Insert(kkey, + std::move(make_unique())); + constexpr auto size = std::tuple_size>::value; + JitKernelRegistrarFunctor + func; + func(kt); + } +}; + +template +class JitKernelRegistrar { + public: + explicit JitKernelRegistrar(KernelType kt) { + JitKernelRegistrarFunctor func; + func(kt); + } + void Touch() {} +}; + +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// Refer always on CPUPlace +#define REGISTER_JITKERNEL_REFER_LITE(kernel_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace, \ + "REGISTER_KERNEL_REFER must be called in global namespace"); \ + static ::paddle::lite::jit::JitKernelRegistrar< \ + ::paddle::lite::jit::ReferKernelPool, \ + ::paddle::lite::fluid::CPUPlace, \ + __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ + ::paddle::lite::jit::KernelType::kernel_type); \ + int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ + return 0; \ + } + +// kernel_type: should be in paddle::lite::jit::KernelType +// place_type: should be one of CPUPlace and GPUPlace in paddle::platform +#define REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, place_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ + UNUSED = LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::lite::jit::JitKernelRegistrar< \ + ::paddle::lite::jit::KernelPool, \ + ::paddle::lite::fluid::place_type, \ + __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ + ::paddle::lite::jit::KernelType::kernel_type); \ + int LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ + __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ + .Touch(); \ + return 0; \ + } + +#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) + +#define REGISTER_GPUKERNEL_MORE_LITE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) + +#define REGISTER_JITKERNEL_GEN_LITE(kernel_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "REGISTER_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::lite::jit::JitKernelRegistrar< \ + ::paddle::lite::jit::JitCodeCreatorPool, \ + ::paddle::lite::fluid::CPUPlace, \ + __VA_ARGS__> \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ + ::paddle::lite::jit::KernelType::kernel_type); \ + int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ + return 0; \ + } + +#define USE_JITKERNEL_GEN_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "USE_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ + static int use_litejitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() + +#define USE_JITKERNEL_REFER_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace_, \ + "USE_JITKERNEL_REFER_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int use_litejitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() + +#define USE_KERNEL_MORE_LITE(kernel_type, impl_type, place_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type##_, \ + "USE_JITKERNEL_MORE_LITE must be called in global namespace"); \ + extern int \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ + static int use_litejitkernel_##kernel_type##_##impl_type##_##place_type##_ \ + UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() + +#define USE_JITKERNEL_MORE_LITE(kernel_type, impl_type) \ + USE_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace) + +} // namespace jit +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/legacy_place.h b/lite/backends/loongarch/legacy_place.h new file mode 100644 index 00000000000..8f96bbd7da9 --- /dev/null +++ b/lite/backends/loongarch/legacy_place.h @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace fluid { + +// Fake the legacy Place. 
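// The registration machinery above keys every jit kernel by a
// (KernelType, PlaceType) pair, so this header only needs a minimal
// stand-in for the legacy fluid place types rather than the full framework.
//
// Rough usage sketch (hedged; the kernel names are illustrative, not lines
// of this patch): a reference kernel is normally registered once per element
// type from a .cc file in the global namespace, e.g.
//
//   REGISTER_JITKERNEL_REFER_LITE(kVAdd,
//                                 refer::VAddKernel<float>,
//                                 refer::VAddKernel<double>);
//
// which inserts each VAddKernel instantiation into ReferKernelPool under the
// key {KernelType::kVAdd, CPUPlace()}; a matching
// USE_JITKERNEL_REFER_LITE(kVAdd) at the use site calls the registrar's
// Touch() so the static registration object survives linking.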
+struct Place { + int which() const { return 1; } // fake +}; + +struct CPUPlace : Place {}; + +} // namespace fluid +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/activation.cc b/lite/backends/loongarch/math/activation.cc new file mode 100644 index 00000000000..912ccc5aeda --- /dev/null +++ b/lite/backends/loongarch/math/activation.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/loongarch/math/activation.h" + +#include "lite/backends/loongarch/xxl.h" +#ifdef __loongarch_asx +#include "lite/backends/loongarch/math/include/mathfuns.h" +#endif + +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template <> +void mish(const float* din, float* dout, int size, float threshold) { +#ifdef __loongarch_asx + int cnt = size >> 3; + int remain = size & 7; +#else + int cnt = size >> 2; + int remain = size & 3; +#endif + +#ifdef __loongarch_asx + __m256 vthreshold = lasx_set1_f32(threshold); + __m256 vone = lasx_set1_f32(1.f); + __m256 vtwo = lasx_set1_f32(2.f); + __m256 minus_vthreshold = lasx_set1_f32(-threshold); + for (int i = 0; i < cnt; i++) { + __m256 vx0 = lasx_loadu_f32(din); + + __m256 gt_0 = lasx_xvfcmp_slt_s(vthreshold, vx0); + __m256 lt_0 = lasx_xvfcmp_slt_s(vx0, minus_vthreshold); + + __m256 vleftx0 = exp256_ps(vx0); + + __m256 vmiddle_temp = lasx_add_f32(vleftx0, vone); // ln(1+e^x) + __m256 vmiddlex0 = log256_ps(vmiddle_temp); + + __m256 sp0 = lasx_blendv_f32(vmiddlex0, vx0, gt_0); + sp0 = lasx_blendv_f32(sp0, vleftx0, lt_0); + + __m256 exp_sp0 = exp256_ps(lasx_mul_f32(sp0, vtwo)); + + __m256 exp_sum0 = lasx_add_f32(exp_sp0, vone); + __m256 exp_diff0 = lasx_sub_f32(exp_sp0, vone); + __m256 tanh = lasx_div_f32(exp_diff0, exp_sum0); + __m256 res0 = lasx_mul_f32(vx0, tanh); + + lasx_storeu_f32(dout, res0); + dout += 8; + din += 8; + } + +#else + + __m128 vthreshold = lsx_set1_f32(threshold); + __m128 vone = lsx_set1_f32(1.f); + __m128 minus_vthreshold = lsx_set1_f32(-threshold); + for (int i = 0; i < cnt; i++) { + __m128 vx0 = lsx_loadu_f32(din); + + __m128 gt_0 = lsx_cmpgt_f32(vx0, vthreshold); + __m128 lt_0 = lsx_cmplt_f32(vx0, minus_vthreshold); + + __m128 data0 = lsx_min_f32(vx0, lsx_set1_f32(70.00008f)); + data0 = lsx_max_f32(data0, lsx_set1_f32(-70.00008f)); + + __m128 vleftx0; + vleftx0[0] = std::exp(data0[0]); + vleftx0[1] = std::exp(data0[1]); + vleftx0[2] = std::exp(data0[2]); + vleftx0[3] = std::exp(data0[3]); + + __m128 vmiddlex0; + vmiddlex0[0] = std::log1p(vleftx0[0]); + vmiddlex0[1] = std::log1p(vleftx0[1]); + vmiddlex0[2] = std::log1p(vleftx0[2]); + vmiddlex0[3] = std::log1p(vleftx0[3]); + + __m128 sp0 = lsx_blendv_f32(vmiddlex0, vx0, gt_0); + sp0 = lsx_blendv_f32(sp0, vleftx0, lt_0); + + sp0 = lsx_min_f32(sp0, lsx_set1_f32(70.00008f)); + sp0 = lsx_max_f32(sp0, lsx_set1_f32(-70.00008f)); + + __m128 exp_sp0; + exp_sp0[0] = std::exp(2 * sp0[0]); + exp_sp0[1] = 
std::exp(2 * sp0[1]); + exp_sp0[2] = std::exp(2 * sp0[2]); + exp_sp0[3] = std::exp(2 * sp0[3]); + + __m128 exp_sum0 = lsx_add_f32(exp_sp0, vone); + __m128 exp_diff0 = lsx_sub_f32(exp_sp0, vone); + __m128 tanh = lsx_div_f32(exp_diff0, exp_sum0); + __m128 res0 = lsx_mul_f32(vx0, tanh); + + lsx_storeu_f32(dout, res0); + dout += 4; + din += 4; + } + +#endif + + for (int i = 0; i < remain; i++) { + float x = din[i]; + float sp = 0.0f; + if (threshold > 0 && x > threshold) + sp = x; + else if (threshold > 0 && x < -threshold) + sp = std::exp(x); + else + sp = std::log1p(std::exp(x)); + dout[i] = x * std::tanh(sp); + } +} + +template <> +void hard_swish(const float* din, + float* dout, + int size, + float scale, + float offset, + float threshold) { +#ifdef __loongarch_asx + int cnt = size >> 5; + int remain = size & 31; + __m256 vec_zero = lasx_set1_f32(0.f); + __m256 vec_scale = lasx_set1_f32(1.0 / scale); + __m256 vec_threshold = lasx_set1_f32(threshold); + __m256 vec_offset = lasx_set1_f32(offset); +#else + int cnt = size >> 4; + int remain = size & 15; +#endif + __m128 vec_zero_128 = lsx_set1_f32(0.f); + __m128 vec_scale_128 = lsx_set1_f32(1.0 / scale); + __m128 vec_threshold_128 = lsx_set1_f32(threshold); + __m128 vec_offset_128 = lsx_set1_f32(offset); + int cnt_4 = remain >> 2; + int rem_4 = remain & 3; + for (int i = 0; i < cnt; i++) { +#ifdef __loongarch_asx + __m256 vin0 = lasx_loadu_f32(din); + __m256 vin1 = lasx_loadu_f32(din + 8); + __m256 vin2 = lasx_loadu_f32(din + 16); + __m256 vin3 = lasx_loadu_f32(din + 24); + __m256 vadd0 = lasx_add_f32(vin0, vec_offset); + __m256 vadd1 = lasx_add_f32(vin1, vec_offset); + __m256 vadd2 = lasx_add_f32(vin2, vec_offset); + __m256 vadd3 = lasx_add_f32(vin3, vec_offset); + __m256 vsum0 = lasx_mul_f32(vin0, vec_scale); + __m256 vsum1 = lasx_mul_f32(vin1, vec_scale); + __m256 vsum2 = lasx_mul_f32(vin2, vec_scale); + __m256 vsum3 = lasx_mul_f32(vin3, vec_scale); + __m256 vres0 = lasx_min_f32(lasx_max_f32(vadd0, vec_zero), vec_threshold); + __m256 vres1 = lasx_min_f32(lasx_max_f32(vadd1, vec_zero), vec_threshold); + __m256 vres2 = lasx_min_f32(lasx_max_f32(vadd2, vec_zero), vec_threshold); + __m256 vres3 = lasx_min_f32(lasx_max_f32(vadd3, vec_zero), vec_threshold); + lasx_storeu_f32(dout, lasx_mul_f32(vres0, vsum0)); + lasx_storeu_f32(dout + 8, lasx_mul_f32(vres1, vsum1)); + lasx_storeu_f32(dout + 16, lasx_mul_f32(vres2, vsum2)); + lasx_storeu_f32(dout + 24, lasx_mul_f32(vres3, vsum3)); + din += 32; + dout += 32; +#else + __m128 vin0 = lsx_loadu_f32(din); + __m128 vin1 = lsx_loadu_f32(din + 4); + __m128 vin2 = lsx_loadu_f32(din + 8); + __m128 vin3 = lsx_loadu_f32(din + 12); + __m128 vadd0 = lsx_add_f32(vin0, vec_offset_128); + __m128 vadd1 = lsx_add_f32(vin1, vec_offset_128); + __m128 vadd2 = lsx_add_f32(vin2, vec_offset_128); + __m128 vadd3 = lsx_add_f32(vin3, vec_offset_128); + __m128 vsum0 = lsx_mul_f32(vin0, vec_scale_128); + __m128 vsum1 = lsx_mul_f32(vin1, vec_scale_128); + __m128 vsum2 = lsx_mul_f32(vin2, vec_scale_128); + __m128 vsum3 = lsx_mul_f32(vin3, vec_scale_128); + __m128 vres0 = + lsx_min_f32(lsx_max_f32(vadd0, vec_zero_128), vec_threshold_128); + __m128 vres1 = + lsx_min_f32(lsx_max_f32(vadd1, vec_zero_128), vec_threshold_128); + __m128 vres2 = + lsx_min_f32(lsx_max_f32(vadd2, vec_zero_128), vec_threshold_128); + __m128 vres3 = + lsx_min_f32(lsx_max_f32(vadd3, vec_zero_128), vec_threshold_128); + lsx_storeu_f32(dout, lsx_mul_f32(vres0, vsum0)); + lsx_storeu_f32(dout + 4, lsx_mul_f32(vres1, vsum1)); + lsx_storeu_f32(dout + 8, 
lsx_mul_f32(vres2, vsum2)); + lsx_storeu_f32(dout + 12, lsx_mul_f32(vres3, vsum3)); + din += 16; + dout += 16; +#endif + } + for (int i = 0; i < cnt_4; i++) { + __m128 vin0 = lsx_loadu_f32(din); + __m128 vadd0 = lsx_add_f32(vin0, vec_offset_128); + __m128 vsum0 = lsx_mul_f32(vin0, vec_scale_128); + __m128 vres0 = + lsx_min_f32(lsx_max_f32(vadd0, vec_zero_128), vec_threshold_128); + lsx_storeu_f32(dout, lsx_mul_f32(vres0, vsum0)); + din += 4; + dout += 4; + } + for (int i = 0; i < rem_4; i++) { + dout[0] = + std::min(std::max(0.f, din[0] + offset), threshold) * din[0] / scale; + dout++; + din++; + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/activation.h b/lite/backends/loongarch/math/activation.h new file mode 100644 index 00000000000..667450dc621 --- /dev/null +++ b/lite/backends/loongarch/math/activation.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +void mish(const T* din, T* dout, int size, float threshold); + +template +void hard_swish(const T* din, + T* dout, + int size, + float scale, + float offset, + float threshold); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/activation_functions.cc b/lite/backends/loongarch/math/activation_functions.cc new file mode 100644 index 00000000000..fd96a61f71b --- /dev/null +++ b/lite/backends/loongarch/math/activation_functions.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef __loongarch_asx + +#include "lite/backends/loongarch/math/activation_functions.h" +#include "lite/backends/loongarch/math/include/mathfuns.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +namespace detail { + +namespace forward { +namespace lasx { +__m256 Relu(const __m256 a) { + __m256 tmp = lasx_set1_f32(0.0f); + return lasx_max_f32(a, tmp); +} + +__m256 Sigmoid(const __m256 a) { + __m256 max = lasx_set1_f32(SIGMOID_THRESHOLD_MAX); + __m256 min = lasx_set1_f32(SIGMOID_THRESHOLD_MIN); + __m256 tmp = lasx_max_f32(a, min); + tmp = lasx_min_f32(tmp, max); + tmp = lasx_sub_f32(lasx_set1_f32(0.0f), tmp); + tmp = lite::loongarch::math::exp256_ps(tmp); + tmp = lasx_add_f32(lasx_set1_f32(1.0f), tmp); + tmp = lasx_div_f32(lasx_set1_f32(1.0f), tmp); + return tmp; +} + +__m256 Tanh(const __m256 a) { + __m256 max = lasx_set1_f32(EXP_MAX_INPUT); + __m256 tmp = lasx_mul_f32(lasx_set1_f32(-2.0f), a); + tmp = lasx_min_f32(tmp, max); + tmp = lite::loongarch::math::exp256_ps(tmp); + return lasx_sub_f32(lasx_div_f32(lasx_set1_f32(2.0f), + lasx_add_f32(lasx_set1_f32(1.0f), tmp)), + lasx_set1_f32(1.0f)); +} + +__m256 Identity(const __m256 a) { return a; } + +} // namespace lasx +} // namespace forward + +namespace backward { +namespace lasx { +__m256 Relu(const __m256 a, const __m256 b) { + return lasx_mul_f32( + a, + lasx_and_f32(lasx_xvfcmp_slt_s(lasx_set1_f32(0.0f), b), + lasx_set1_f32(1.0f))); +} + +__m256 Sigmoid(const __m256 a, const __m256 b) { + return lasx_mul_f32(lasx_mul_f32(a, b), + lasx_sub_f32(lasx_set1_f32(1.0f), b)); +} + +__m256 Tanh(const __m256 a, const __m256 b) { + return lasx_mul_f32( + a, lasx_sub_f32(lasx_set1_f32(1.0f), lasx_mul_f32(b, b))); +} + +__m256 Identity(const __m256 a, const __m256 b) { return a; } +} // namespace lasx +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle + +#endif diff --git a/lite/backends/loongarch/math/activation_functions.h b/lite/backends/loongarch/math/activation_functions.h new file mode 100644 index 00000000000..f214cba09a5 --- /dev/null +++ b/lite/backends/loongarch/math/activation_functions.h @@ -0,0 +1,193 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "lite/backends/loongarch/cpu_info.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +namespace detail { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +inline ActivationType GetActivationType(const std::string &type) { + if (type == "sigmoid") { + return ActivationType::kSigmoid; + } else if (type == "relu") { + return ActivationType::kReLU; + } else if (type == "tanh") { + return ActivationType::kTanh; + } else if (type == "identity" || type == "") { + return ActivationType::kIdentity; + } + LOG(ERROR) << "Not support type " << type; + return ActivationType(); +} + +namespace forward { + +template +T Identity(const T a) { + return a; +} + +template +T Relu(const T a) { + return a > static_cast(0.0) ? a : static_cast(0.0); +} + +template +T Sigmoid(const T a) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + T tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); +} + +template +T Tanh(const T a) { + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +} // namespace forward + +namespace backward { + +template +T Identity(const T a, const T b) { + return a; +} + +template +T Relu(const T a, const T b) { + return a * (b > 0.0 ? 1.0 : 0.0); +} + +template +T Sigmoid(const T a, const T b) { + return a * b * (1.0 - b); +} + +template +T Tanh(const T a, const T b) { + return a * (1.0 - b * b); +} + +} // namespace backward + +template +struct Active { + typedef T (*Act)(T); + typedef T (*ActGrad)(T, T); +}; + +static Active::Act kActFloat[] = {&forward::Sigmoid, + &forward::Relu, + &forward::Tanh, + &forward::Identity}; + +static Active::ActGrad kActGradFloat[] = {&backward::Sigmoid, + &backward::Relu, + &backward::Tanh, + &backward::Identity}; + +static Active::Act kActDouble[] = {&forward::Sigmoid, + &forward::Relu, + &forward::Tanh, + &forward::Identity}; + +static Active::ActGrad kActGradDouble[] = {&backward::Sigmoid, + &backward::Relu, + &backward::Tanh, + &backward::Identity}; + +namespace forward { +inline float activation(float a, int index) { return kActFloat[index](a); } + +inline double activation(double a, int index) { return kActDouble[index](a); } + +} // namespace forward + +namespace backward { +inline float activation(float a, float b, int index) { + return kActGradFloat[index](a, b); +} + +inline double activation(double a, double b, int index) { + return kActGradDouble[index](a, b); +} +} // namespace backward + +#ifdef __loongarch_asx +namespace forward { +namespace lasx { +__m256 Relu(const __m256 a); +__m256 Sigmoid(const __m256 a); +__m256 Tanh(const __m256 a); +__m256 Identity(const __m256 a); +} // namespace lasx +} // namespace forward + +namespace backward { +namespace lasx { +__m256 Relu(const __m256 a, const __m256 b); +__m256 Sigmoid(const __m256 a, const __m256 b); +__m256 Tanh(const __m256 a, const __m256 b); +__m256 Identity(const __m256 a, const __m256 b); +} // namespace lasx +} // namespace backward + +static Active<__m256>::Act kActLasx[] = {&forward::lasx::Sigmoid, + &forward::lasx::Relu, + &forward::lasx::Tanh, + &forward::lasx::Identity}; + +static Active<__m256>::ActGrad kActGradLasx[] = {&backward::lasx::Sigmoid, + &backward::lasx::Relu, + 
&backward::lasx::Tanh, + &backward::lasx::Identity}; + +namespace forward { +inline __m256 activation(__m256 a, int index) { return kActLasx[index](a); } +} // namespace forward + +namespace backward { +inline __m256 activation(__m256 a, __m256 b, int index) { + return kActGradLasx[index](a, b); +} +} // namespace backward + +#endif + +} // namespace detail +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/blas.cc b/lite/backends/loongarch/math/blas.cc new file mode 100644 index 00000000000..3b0613aa12a --- /dev/null +++ b/lite/backends/loongarch/math/blas.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/loongarch/math/blas.h" + +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, + int num_flatten_cols, + bool trans) { + CHECK_GT(tensor_dim.size(), 1u); + MatDescriptor retv; + if (num_flatten_cols > 1) { + auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); + retv.height_ = flatten_dim[0]; + retv.width_ = flatten_dim[1]; + } else { + if (tensor_dim.size() == 2) { + retv.height_ = tensor_dim[0]; + retv.width_ = tensor_dim[1]; + } else { + auto dim_vec = tensor_dim.Vectorize(); + retv.batch_size_ = 1; + for (size_t i = 0; i < dim_vec.size() - 2; ++i) { + retv.batch_size_ *= dim_vec[i]; + } + retv.height_ = dim_vec[dim_vec.size() - 2]; + retv.width_ = dim_vec[dim_vec.size() - 1]; + retv.stride_ = retv.height_ * retv.width_; + } + } + if (trans) { + std::swap(retv.width_, retv.height_); + } + retv.trans_ = trans; + return retv; +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/blas.h b/lite/backends/loongarch/math/blas.h new file mode 100644 index 00000000000..02cf08a38ff --- /dev/null +++ b/lite/backends/loongarch/math/blas.h @@ -0,0 +1,342 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/** + * Matrix Descriptor of a memory buffer. + * + * It is used for Blas::MatMul. MatMul operator can be batched. 
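 *
 * A worked example (the shapes are hypothetical): CreateMatrixDescriptor
 * above maps a 3-D tensor of shape [2, 3, 4], with num_flatten_cols <= 1 and
 * trans = false, to batch_size_ = 2, height_ = 3, width_ = 4 and
 * stride_ = 12, i.e. a batch of two contiguous 3x4 matrices. In MatMul
 * terms: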
+ * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a + * `batch_size` times of GEMM. The batched GEMM could be faster base on the + * implementation of the blas library. The batch size could be zero. If any + * matrix of `matmul` has a batch size, the will be a batched GEMM, too. e.g., + * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be + * [BatchSize, H1, W2] + * + * The boolean flag, `trans`, describe the memory is the transpose of matrix or + * not. If the trans is true, the last two dims of matrix are transposed. The + * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height]. + * + * The MatDescriptor is not only the dimension or shape of a matrix, it also + * contains the layout, stride of matrix. It is clearer to have a structure than + * reuse `DDim`. + */ +struct MatDescriptor { + int64_t height_; + int64_t width_; + int64_t stride_{0}; + int64_t batch_size_{0}; + bool trans_; +}; + +/** + * Create Matrix Descriptor from a tensor dim, num_flatten_cols, and transpose + * flag + * + * @param tensor_dim: The dimension of the tensor. The rank of this dimension + * must larger than 1. + * + * @param num_flatten_cols: Reshape a tensor to a matrix. The matrix's first + * dimension(column length) will be the product of tensor's first `num_col_dims` + * dimensions. If num_flatten_cols is zero, the first N-2 dimension will be the + * batch_size of descriptor. + * + * @param trans: True if the matrix is transposed. + */ +extern MatDescriptor CreateMatrixDescriptor(const lite::DDimLite& tensor_dim, + int num_flatten_cols, + bool trans); + +template +class Blas { + public: + explicit Blas(const lite::Context& context) : context_(context) {} + + template + void GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T* A, + const T* B, + T beta, + T* C) const; + + template + void GEMM(bool transA, + bool transB, + int M, + int N, + int K, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, + int ldc) const; + + template + void GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, + int ldc) const; + + template + void MatMul(const int M, + const int N, + const int K, + const T* A, + const T* B, + T* C) const; + + template + void MatMul(const lite::TensorLite& mat_a, + bool trans_a, + const lite::TensorLite& mat_b, + bool trans_b, + T alpha, + lite::TensorLite* mat_out, + T beta) const; + + template + void MatMul(const lite::TensorLite& mat_a, + bool trans_a, + const lite::TensorLite& mat_b, + bool trans_b, + lite::TensorLite* mat_out) const { + MatMul(mat_a, + trans_a, + mat_b, + trans_b, + static_cast(1.0), + mat_out, + static_cast(0.0)); + } + + template + void MatMul(const lite::TensorLite& mat_a, + const lite::TensorLite& mat_b, + lite::TensorLite* mat_out) const { + this->template MatMul(mat_a, false, mat_b, false, mat_out); + } + + template + void AXPY(int n, T alpha, const T* x, T* y) const; + + template + void VADD(int n, const T* x, const T* y, T* z) const; + + template + void VMUL(int n, const T* x, const T* y, T* z) const; + + template + void VCOPY(int n, const T* x, T* y) const; + + template + void VEXP(int n, const T* x, T* y) const; + + template + void VSQUARE(int n, const T* x, T* y) const; + + template + void VPOW(int n, const T* x, T alpha, T* y) const; + + template + void GEMV(bool trans_a, + int M, + int N, + T alpha, + const T* A, 
+ const T* B, + T beta, + T* C) const; + + template + T DOT(int n, const T* x, const T* y) const; + + template + void SCAL(int n, const T a, T* x) const; + + template + T ASUM(int n, T* x, int inc) const; + + template + void BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T* A, + const T* B, + T beta, + T* C, + int batchCount, + int64_t strideA, + int64_t strideB) const; + + template + void MatMul(const lite::TensorLite& mat_a, + const MatDescriptor& dim_a, + const lite::TensorLite& mat_b, + const MatDescriptor& dim_b, + T alpha, + lite::TensorLite* mat_out, + T beta) const; + + template + void VINV(int n, const T* a, T* y) const; + + template + void VMERF(int n, const T* a, T* y, int64_t mode) const; + + private: + const lite::Context& context_; +}; + +template +class BlasT : private Blas { + public: + using Blas::Blas; + + template + void GEMM(ARGS... args) const { + Base()->template GEMM(args...); + } + + template + void MatMul(ARGS... args) const { + Base()->template MatMul(args...); + } + + template + void AXPY(ARGS... args) const { + Base()->template AXPY(args...); + } + + template + void VADD(ARGS... args) const { + Base()->template VADD(args...); + } + + template + void VMUL(ARGS... args) const { + Base()->template VMUL(args...); + } + + template + void VCOPY(ARGS... args) const { + Base()->template VCOPY(args...); + } + + template + void VEXP(ARGS... args) const { + Base()->template VEXP(args...); + } + + template + void VSQUARE(ARGS... args) const { + Base()->template VSQUARE(args...); + } + + template + void VPOW(ARGS... args) const { + Base()->template VPOW(args...); + } + + template + void GEMV(ARGS... args) const { + Base()->template GEMV(args...); + } + + template + T DOT(ARGS... args) const { + return Base()->template DOT(args...); + } + + template + void SCAL(ARGS... args) const { + Base()->template SCAL(args...); + } + + template + T ASUM(ARGS... args) const { + return Base()->template ASUM(args...); + } + + template + void BatchedGEMM(ARGS... args) const { + Base()->template BatchedGEMM(args...); + } + + template + void VINV(ARGS... args) const { + Base()->template VINV(args...); + } + + template + void VMERF(ARGS... args) const { + Base()->template VMERF(args...); + } + + private: + const Blas* Base() const { + return static_cast*>(this); + } +}; + +// template +// inline BlasT GetBlas( +// const framework::ExecutionContext& exe_ctx) { +// return BlasT( +// exe_ctx.template device_context()); +//} + +template +inline BlasT GetBlas(const lite::Context& dev_ctx) { + return BlasT(dev_ctx); +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle + +#include "lite/backends/loongarch/math/blas_impl.h" diff --git a/lite/backends/loongarch/math/blas_impl.h b/lite/backends/loongarch/math/blas_impl.h new file mode 100644 index 00000000000..13646b77bb5 --- /dev/null +++ b/lite/backends/loongarch/math/blas_impl.h @@ -0,0 +1,452 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
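The BlasT wrapper above is the entry point the LoongArch kernels are expected to go through. A minimal caller sketch, assuming BlasT/GetBlas are parameterized over <lite::TargetType Target, typename T> and that a kLoongArch device context exists; the function and variable names below are illustrative only, not part of this patch:

// Hypothetical caller: C[M, N] = A[M, K] * B[K, N], row-major, no transposes.
template <typename T>
void RunMatMul(const lite::Context<lite::TargetType::kLoongArch>& ctx,
               const lite::TensorLite& a,
               const lite::TensorLite& b,
               lite::TensorLite* c) {
  auto blas =
      lite::loongarch::math::GetBlas<lite::TargetType::kLoongArch, T>(ctx);
  // The two-matrix overload defaults to alpha = 1, beta = 0.
  blas.MatMul(a, /*trans_a=*/false, b, /*trans_b=*/false, c);
}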
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include "lite/backends/loongarch/math/math_function.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct CBlas; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + cblas_sgemm(args...); + } + + template + static void AXPY(ARGS... args) { + cblas_saxpy(args...); + } + + template + static void VCOPY(ARGS... args) { + cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + cblas_sgemv(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + cblas_daxpy(args...); + } + + template + static void VCOPY(ARGS... args) { + cblas_dcopy(args...); + } + + template + static void GEMV(ARGS... args) { + cblas_dgemv(args...); + } +}; + +template <> +struct CBlas { + static void GEMM(...) { LOG(FATAL) << "float16 GEMM not supported on CPU"; } + static void VMUL(...) { LOG(FATAL) << "float16 VMUL not supported on CPU"; } + static void VEXP(...) { LOG(FATAL) << "float16 VEXP not supported on CPU"; } + static void VSQUARE(...) { + LOG(FATAL) << "float16 VSQUARE not supported on CPU"; + } + static void VPOW(...) { LOG(FATAL) << "float16 VPOW not supported on CPU"; } + static void DOT(...) { LOG(FATAL) << "float16 DOT not supported on CPU"; }; + static void SCAL(...) { LOG(FATAL) << "float16 SCAL not supported on CPU"; }; + static void ASUM(...) { LOG(FATAL) << "float16 ASUM not supported on CPU"; }; +}; + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + CBlas::GEMM(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + +template <> +template +void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + T alpha, + const T *A, + int lda, + const T *B, + int ldb, + T beta, + T *C, + int ldc) const { + CBlas::GEMM(CblasRowMajor, + transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + int lda, + const T *B, + int ldb, + T beta, + T *C, + int ldc) const { + CBlas::GEMM(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + +template +template +void Blas::MatMul(const lite::Tensor &mat_a, + bool trans_a, + const lite::Tensor &mat_b, + bool trans_b, + T alpha, + lite::Tensor *mat_out, + T beta) const { + auto dim_a = mat_a.dims(); + auto dim_b = mat_b.dims(); + auto dim_out = mat_out->dims(); + CHECK(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2) + << "The input and output of matmul be matrix"; + // CHECK( + // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target()) + // << "The targets of matrices must be same"; + + int M = dim_out[0]; + int N = dim_out[1]; + int K = !trans_a ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = !trans_b ? 
CblasNoTrans : CblasTrans; + + this->GEMM(transA, + transB, + M, + N, + K, + alpha, + mat_a.data(), + mat_b.data(), + beta, + mat_out->template mutable_data()); +} + +template <> +template +void Blas::AXPY(int n, + T alpha, + const T *x, + T *y) const { + CBlas::AXPY(n, alpha, x, 1, y, 1); +} + +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + CBlas::VCOPY(n, x, 1, y, 1); +} + +template <> +template +void Blas::VADD(int n, + const T *x, + const T *y, + T *z) const { + this->template VCOPY(n, y, z); + this->template AXPY(n, 1., x, z); +} + +template <> +template +void Blas::VMUL(int n, + const T *x, + const T *y, + T *z) const { + // try to find if openblas support vmul + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +template <> +template +void Blas::VEXP(int n, const T *x, T *y) const { + // try to find if openblas support vexp + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template <> +template +void Blas::VSQUARE(int n, const T *x, T *y) const { + for (int i = 0; i < n; ++i) { + y[i] = x[i] * x[i]; + } +} + +template <> +template +void Blas::VPOW(int n, const T *x, T a, T *y) const { + for (int i = 0; i < n; ++i) { + y[i] = std::pow(x[i], a); + } +} + +template <> +template +T Blas::DOT(int n, const T *x, const T *y) const { + // try to find if openblas support cblas_dot + T sum = 0; + for (int i = 0; i < n; ++i) { + sum += x[i] * y[i]; + } + return sum; +} + +template <> +template +void Blas::SCAL(int n, const T a, T *x) const { + // try to find if openblas support cblas_scal + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} + +template <> +template +T Blas::ASUM(int n, T *x, int inc) const { + auto sum = static_cast(0.0); + // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum + for (int c = 0; c < n; ++c) { + sum += x[c]; + } + return sum; +} + +template <> +template +void Blas::GEMV(bool trans_a, + int M, + int N, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const { + CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; + CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); +} + +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + const T *B, + T beta, + T *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + for (int k = 0; k < batchCount; ++k) { + auto *Ak = &A[k * strideA]; + auto *Bk = &B[k * strideB]; + auto *Ck = &C[k * M * N]; + this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); + } +} + +template +template +void Blas::MatMul( + const int M, const int N, const int K, const T *A, const T *B, T *C) const { + this->template GEMM(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + A, + K, + B, + N, + static_cast(0), + C, + N); +} + +template <> +template +void Blas::MatMul( + const int M, const int N, const int K, const T *A, const T *B, T *C) const { + CBlas::GEMM(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + A, + K, + B, + N, + static_cast(0), + C, + N); +} + +template +template +void Blas::MatMul(const lite::Tensor &mat_a, + const MatDescriptor &dim_a, + const lite::Tensor &mat_b, + const MatDescriptor &dim_b, + T alpha, + lite::Tensor *mat_out, + T beta) const { + CHECK_EQ(dim_a.width_, dim_b.height_); + CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = !dim_b.trans_ ? 
CblasNoTrans : CblasTrans; + if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { + this->template GEMM(transA, + transB, + dim_a.height_, + dim_b.width_, + dim_a.width_, + alpha, + mat_a.data(), + mat_b.data(), + beta, + mat_out->template mutable_data()); + } else { + CHECK(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0); + this->template BatchedGEMM( + transA, + transB, + dim_a.height_, + dim_b.width_, + dim_a.width_, + alpha, + mat_a.data(), + mat_b.data(), + beta, + mat_out->template mutable_data(), + dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_, + dim_a.stride_, + dim_b.stride_); + } +} +template +template +void Blas::VINV(int n, const T *a, T *y) const { + for (int i = 0; i < n; ++i) { + y[i] = 1.0 / a[i]; + } +} + +template <> +template +void Blas::VMERF(int n, + const T *a, + T *y, + int64_t mode) const { + for (int i = 0; i < n; ++i) { + y[i] = std::erf(a[i]); + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/box_coder.cc b/lite/backends/loongarch/math/box_coder.cc new file mode 100644 index 00000000000..795d18adbdb --- /dev/null +++ b/lite/backends/loongarch/math/box_coder.cc @@ -0,0 +1,158 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/loongarch/math/box_coder.h" +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void encode_center_size(const int64_t row, // N + const int64_t col, // M + const int64_t len, // 4 + const float* target_box_data, + const float* prior_box_data, + const float* prior_box_var_data, + const bool normalized, + const std::vector variance, + float* output) { +#pragma omp parallel for collapse(2) + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + size_t offset = i * col * len + j * len; + float prior_box_width = prior_box_data[j * len + 2] - + prior_box_data[j * len] + (normalized == false); + float prior_box_height = prior_box_data[j * len + 3] - + prior_box_data[j * len + 1] + + (normalized == false); + float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2; + float prior_box_center_y = + prior_box_data[j * len + 1] + prior_box_height / 2; + + float target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + float target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + float target_box_width = target_box_data[i * len + 2] - + target_box_data[i * len] + (normalized == false); + float target_box_height = target_box_data[i * len + 3] - + target_box_data[i * len + 1] + + (normalized == false); + + output[offset] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[offset + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)); + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)); + } + } + + if (prior_box_var_data) { +#pragma omp parallel for collapse(3) + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + for (int64_t k = 0; k < len; ++k) { + size_t offset = i * col * len + j * len; + int prior_var_offset = j * len; + output[offset + k] /= prior_box_var_data[prior_var_offset + k]; + } + } + } + } else if (!(variance.empty())) { +#pragma omp parallel for collapse(3) + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + for (int64_t k = 0; k < len; ++k) { + size_t offset = i * col * len + j * len; + output[offset + k] /= variance[k]; + } + } + } + } +} + +void decode_center_size(const int axis, + const int var_size, + const int64_t row, + const int64_t col, + const int64_t len, + const float* target_box_data, + const float* prior_box_data, + const float* prior_box_var_data, + const bool normalized, + const std::vector variance, + float* output) { +#pragma omp parallel for collapse(2) + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + float var_data[4] = {1., 1., 1., 1.}; + float* var_ptr = var_data; + size_t offset = i * col * len + j * len; + int prior_box_offset = axis == 0 ? j * len : i * len; + + float prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + + (normalized == false); + float prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + + (normalized == false); + float prior_box_center_x = + prior_box_data[prior_box_offset] + prior_box_width / 2; + float prior_box_center_y = + prior_box_data[prior_box_offset + 1] + prior_box_height / 2; + + float target_box_center_x = 0, target_box_center_y = 0; + float target_box_width = 0, target_box_height = 0; + int prior_var_offset = axis == 0 ? 
j * len : i * len; + if (var_size == 2) { + std::memcpy( + var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float)); + } else if (var_size == 1) { + var_ptr = const_cast(variance.data()); + } + float box_var_x = *var_ptr; + float box_var_y = *(var_ptr + 1); + float box_var_w = *(var_ptr + 2); + float box_var_h = *(var_ptr + 3); + + target_box_center_x = + box_var_x * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = + box_var_y * target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = + std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width; + target_box_height = + std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height; + + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = + target_box_center_x + target_box_width / 2 - (normalized == false); + output[offset + 3] = + target_box_center_y + target_box_height / 2 - (normalized == false); + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/box_coder.h b/lite/backends/loongarch/math/box_coder.h new file mode 100644 index 00000000000..ca0d3317905 --- /dev/null +++ b/lite/backends/loongarch/math/box_coder.h @@ -0,0 +1,50 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/loongarch/math/math_function.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void encode_center_size(const int64_t row, + const int64_t col, + const int64_t len, + const float* target_box_data, + const float* prior_box_data, + const float* prior_box_var_data, + const bool normalized, + const std::vector variance, + float* output); + +void decode_center_size(const int axis, + const int var_size, + const int64_t row, + const int64_t col, + const int64_t len, + const float* target_box_data, + const float* prior_box_data, + const float* prior_box_var_data, + const bool normalized, + const std::vector variance, + float* output); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/calib.cc b/lite/backends/loongarch/math/calib.cc new file mode 100644 index 00000000000..828aca56aca --- /dev/null +++ b/lite/backends/loongarch/math/calib.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
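For reference, encode_center_size and decode_center_size above implement the familiar center-size box coding used by SSD/Faster R-CNN style detectors. With a prior box of center (xp, yp) and size (wp, hp), a target box of center (xt, yt) and size (wt, ht), and variances (var_x, var_y, var_w, var_h), the encoded output is

    tx = (xt - xp) / wp / var_x
    ty = (yt - yp) / hp / var_y
    tw = log(wt / wp) / var_w
    th = log(ht / hp) / var_h

decode_center_size inverts this (xt = var_x * tx * wp + xp, wt = exp(var_w * tw) * wp, and so on) and converts back to corner form; for unnormalized boxes a width/height of one pixel is added, matching the `(normalized == false)` terms in the code.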
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/loongarch/math/calib.h" +#include +#include +#include "lite/backends/loongarch/xxl.h" +#include "lite/backends/loongarch/math/include/mathfuns.h" +#include "lite/backends/loongarch/math/saturate.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +void fp32_to_int8(const float* din, + int8_t* dout, + const float* scale, + int axis_size, + int64_t outer_size, + int64_t inner_size) { +#ifdef __loongarch_asx + int cnt = inner_size >> 5; + int remain = inner_size & 31; +#else + int cnt = inner_size >> 4; + int remain = inner_size & 15; +#endif + int rem_cnt = remain >> 3; + int rem_rem = remain & 7; + int64_t loop_size = outer_size * axis_size; +#pragma omp parallel for + for (int j = 0; j < loop_size; ++j) { + float inv_scale = 1.f / scale[j % axis_size]; +#ifdef __loongarch_asx + __m256 vzero_l = lasx_set1_f32(-127.f); + __m256 vscale_l = lasx_set1_f32(inv_scale); +#endif + __m128 vzero = lsx_set1_f32(-127.f); + __m128 vscale = lsx_set1_f32(inv_scale); + const float* din_c = din + j * inner_size; + int8_t* dout_c = dout + j * inner_size; +#ifdef __loongarch_asx + for (int i = 0; i < cnt; i++) { + __m256 vin0 = lasx_loadu_f32(din_c); + __m256 vin1 = lasx_loadu_f32(din_c + 8); + __m256 vin2 = lasx_loadu_f32(din_c + 16); + __m256 vin3 = lasx_loadu_f32(din_c + 24); + __m256 vout0 = lasx_mul_f32(vin0, vscale_l); + __m256 vout1 = lasx_mul_f32(vin1, vscale_l); + __m256 vout2 = lasx_mul_f32(vin2, vscale_l); + __m256 vout3 = lasx_mul_f32(vin3, vscale_l); + vin0 = lasx_blendv_f32( + vzero_l, vout0, lasx_xvfcmp_slt_s(vzero_l, vout0)); + vin1 = lasx_blendv_f32( + vzero_l, vout1, lasx_xvfcmp_slt_s(vzero_l, vout1)); + vin2 = lasx_blendv_f32( + vzero_l, vout2, lasx_xvfcmp_slt_s(vzero_l, vout2)); + vin3 = lasx_blendv_f32( + vzero_l, vout3, lasx_xvfcmp_slt_s(vzero_l, vout3)); + // fp32->int32 + __m256i vres0 = lasx_cvtf32_i32(vin0); + __m256i vres1 = lasx_cvtf32_i32(vin1); + __m256i vres2 = lasx_cvtf32_i32(vin2); + __m256i vres3 = lasx_cvtf32_i32(vin3); + __m256i vres0_16 = lasx_packs_i32(vres0, vres0); + __m256i vres1_16 = lasx_packs_i32(vres1, vres1); + __m256i vres2_16 = lasx_packs_i32(vres2, vres2); + __m256i vres3_16 = lasx_packs_i32(vres3, vres3); + __m256i vres0_8 = lasx_packs_i16(vres0_16, vres0_16); + __m256i vres1_8 = lasx_packs_i16(vres1_16, vres1_16); + __m256i vres2_8 = lasx_packs_i16(vres2_16, vres2_16); + __m256i vres3_8 = lasx_packs_i16(vres3_16, vres3_16); + *(reinterpret_cast(dout_c)) = (reinterpret_cast(&vres0_8))[0]; + *(reinterpret_cast(dout_c + 4)) = + (reinterpret_cast(&vres0_8))[4]; + *(reinterpret_cast(dout_c + 8)) = + (reinterpret_cast(&vres1_8))[0]; + *(reinterpret_cast(dout_c + 12)) = + (reinterpret_cast(&vres1_8))[4]; + *(reinterpret_cast(dout_c + 16)) = + (reinterpret_cast(&vres2_8))[0]; + *(reinterpret_cast(dout_c + 20)) = + (reinterpret_cast(&vres2_8))[4]; + *(reinterpret_cast(dout_c + 24)) = + (reinterpret_cast(&vres3_8))[0]; + *(reinterpret_cast(dout_c + 28)) = + (reinterpret_cast(&vres3_8))[4]; + din_c += 32; + dout_c += 32; + } +#else + for (int i = 0; i < cnt; i++) { + __m128 vin0 = 
lsx_loadu_f32(din_c); + __m128 vin1 = lsx_loadu_f32(din_c + 4); + __m128 vin2 = lsx_loadu_f32(din_c + 8); + __m128 vin3 = lsx_loadu_f32(din_c + 12); + __m128 vout0 = lsx_mul_f32(vin0, vscale); + __m128 vout1 = lsx_mul_f32(vin1, vscale); + __m128 vout2 = lsx_mul_f32(vin2, vscale); + __m128 vout3 = lsx_mul_f32(vin3, vscale); + vin0 = lsx_blendv_f32(vzero, vout0, lsx_vfcmp_slt_s(vzero, vout0)); + vin1 = lsx_blendv_f32(vzero, vout1, lsx_vfcmp_slt_s(vzero, vout1)); + vin2 = lsx_blendv_f32(vzero, vout2, lsx_vfcmp_slt_s(vzero, vout2)); + vin3 = lsx_blendv_f32(vzero, vout3, lsx_vfcmp_slt_s(vzero, vout3)); + // fp32->int32 + __m128i vres0 = lsx_cvtf32_i32(vin0); + __m128i vres1 = lsx_cvtf32_i32(vin1); + __m128i vres2 = lsx_cvtf32_i32(vin2); + __m128i vres3 = lsx_cvtf32_i32(vin3); + __m128i vres0_16 = lsx_packs_i32(vres0, vres0); + __m128i vres1_16 = lsx_packs_i32(vres1, vres1); + __m128i vres2_16 = lsx_packs_i32(vres2, vres2); + __m128i vres3_16 = lsx_packs_i32(vres3, vres3); + __m128i vres0_8 = lsx_packs_i16(vres0_16, vres0_16); + __m128i vres1_8 = lsx_packs_i16(vres1_16, vres1_16); + __m128i vres2_8 = lsx_packs_i16(vres2_16, vres2_16); + __m128i vres3_8 = lsx_packs_i16(vres3_16, vres3_16); + *(reinterpret_cast(dout_c)) = lsx_extract_i32(vres0_8, 0); + *(reinterpret_cast(dout_c + 4)) = lsx_extract_i32(vres1_8, 0); + *(reinterpret_cast(dout_c + 8)) = lsx_extract_i32(vres2_8, 0); + *(reinterpret_cast(dout_c + 12)) = lsx_extract_i32(vres3_8, 0); + din_c += 16; + dout_c += 16; + } +#endif + for (int i = 0; i < rem_cnt; i++) { + __m128 vin0 = lsx_loadu_f32(din_c); + __m128 vout0 = lsx_mul_f32(vin0, vscale); + vin0 = lsx_blendv_f32(vzero, vout0, lsx_vfcmp_slt_s(vzero, vout0)); + // fp32->int32 + __m128i vres0 = lsx_cvtf32_i32(vin0); + __m128i vres0_16 = lsx_packs_i32(vres0, vres0); + __m128i vres0_8 = lsx_packs_i16(vres0_16, vres0_16); + *(reinterpret_cast(dout_c)) = lsx_extract_i32(vres0_8, 0); + din_c += 8; + dout_c += 8; + } + for (int i = 0; i < rem_rem; ++i) { + dout_c[i] = saturate_cast(roundf(inv_scale * din_c[i])); + dout_c[i] = dout_c[i] < -127 ? 
-127 : dout_c[i]; + } + } +} + +void int8_to_fp32(const int8_t* in, + float* out, + const float* scale, + int axis_size, + int64_t outer_size, + int64_t inner_size) { +#ifdef __loongarch_asx + int cnt = inner_size >> 5; + int remain = inner_size & 31; +#else + int cnt = inner_size >> 4; + int remain = inner_size & 15; +#endif + int rem_cnt = remain >> 2; + int rem_rem = remain & 3; + int64_t loop_size = axis_size * outer_size; +#pragma omp parallel for + for (int64_t n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const int8_t* din_c = in + n * inner_size; + float* dout_c = out + n * inner_size; +#ifdef __loongarch_asx + __m256 vscale_l = lasx_set1_f32(in_scale); +#endif + __m128 vscale = lsx_set1_f32(in_scale); + +#ifdef __loongarch_asx + for (int i = 0; i < cnt; i++) { + __m128i vin0 = lsx_loadu_epi8(din_c); + __m128i vin1 = lsx_loadu_epi8(din_c + 8); + __m128i vin2 = lsx_loadu_epi8(din_c + 16); + __m128i vin3 = lsx_loadu_epi8(din_c + 24); + // 8bits x 16 -> 32bits x 8 + __m256i v00 = lasx_cvti8_i32(vin0); + __m256i v01 = lasx_cvti8_i32(vin1); + __m256i v02 = lasx_cvti8_i32(vin2); + __m256i v03 = lasx_cvti8_i32(vin3); + // int32 -> fp32 + __m256 vout0 = lasx_mul_f32(lasx_cvti32_f32(v00), vscale_l); + __m256 vout1 = lasx_mul_f32(lasx_cvti32_f32(v01), vscale_l); + __m256 vout2 = lasx_mul_f32(lasx_cvti32_f32(v02), vscale_l); + __m256 vout3 = lasx_mul_f32(lasx_cvti32_f32(v03), vscale_l); + lasx_storeu_f32(dout_c, vout0); + lasx_storeu_f32(dout_c + 8, vout1); + lasx_storeu_f32(dout_c + 16, vout2); + lasx_storeu_f32(dout_c + 24, vout3); + din_c += 32; + dout_c += 32; + } +#else + for (int i = 0; i < cnt; i++) { + __m128i vin0 = lsx_loadu_epi8(din_c); + __m128i vin1 = lsx_loadu_epi8(din_c + 4); + __m128i vin2 = lsx_loadu_epi8(din_c + 8); + __m128i vin3 = lsx_loadu_epi8(din_c + 12); + // 8bits x 16 -> 32bits x 4 + __m128i v00 = lsx_cvti8_i32(vin0); + __m128i v01 = lsx_cvti8_i32(vin1); + __m128i v02 = lsx_cvti8_i32(vin2); + __m128i v03 = lsx_cvti8_i32(vin3); + // int32 -> fp32 + __m128 vout0 = lsx_mul_f32(lsx_cvti32_f32(v00), vscale); + __m128 vout1 = lsx_mul_f32(lsx_cvti32_f32(v01), vscale); + __m128 vout2 = lsx_mul_f32(lsx_cvti32_f32(v02), vscale); + __m128 vout3 = lsx_mul_f32(lsx_cvti32_f32(v03), vscale); + lsx_storeu_f32(dout_c, vout0); + lsx_storeu_f32(dout_c + 4, vout1); + lsx_storeu_f32(dout_c + 8, vout2); + lsx_storeu_f32(dout_c + 12, vout3); + din_c += 16; + dout_c += 16; + } +#endif + for (int i = 0; i < rem_cnt; i++) { + __m128i vin0 = lsx_loadu_epi8(din_c); + // 8bits x 16 -> 32bits x 4 + __m128i v00 = lsx_cvti8_i32(vin0); + // int32 -> fp32 + __m128 vout0 = lsx_mul_f32(lsx_cvti32_f32(v00), vscale); + lsx_storeu_f32(dout_c, vout0); + + din_c += 4; + dout_c += 4; + } + for (int i = 0; i < rem_rem; ++i) { + dout_c[i] = in_scale * din_c[i]; + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/calib.h b/lite/backends/loongarch/math/calib.h new file mode 100644 index 00000000000..7f206d523bf --- /dev/null +++ b/lite/backends/loongarch/math/calib.h @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +void fp32_to_int8(const float* din, + int8_t* dout, + const float* scale, + int axis_size, + int64_t outer_size, + int64_t inner_size); + +void int8_to_fp32(const int8_t* in, + float* out, + const float* scale, + int axis_size, + int64_t outer_size, + int64_t inner_size); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/clip.cc b/lite/backends/loongarch/math/clip.cc new file mode 100644 index 00000000000..e277d882e65 --- /dev/null +++ b/lite/backends/loongarch/math/clip.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
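+
+// clip<float>, defined below: element-wise clamp of din into [min_, max_].
+// The main loop handles 16 floats per iteration, using two 256-bit LASX
+// vectors when __loongarch_asx is available and four 128-bit LSX vectors
+// otherwise, followed by a 4-wide LSX tail and a scalar tail.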
+ +#include "lite/backends/loongarch/math/clip.h" +#include "lite/backends/loongarch/xxl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +template <> +void clip( + const float* din, float* dout, const int num, float max_, float min_) { + int cnt = num >> 4; + int remain = num % 16; + int rem_cnt = remain >> 2; + int rem_rem = remain & 3; + float* ptr_out = dout; + const float* ptr_in = din; +#ifdef __loongarch_asx + __m256 max_256 = lasx_set1_f32(max_); + __m256 min_256 = lasx_set1_f32(min_); +#endif + __m128 vmax = lsx_set1_f32(max_); + __m128 vmin = lsx_set1_f32(min_); + for (int i = 0; i < cnt; i++) { +#ifdef __loongarch_asx + __m256 vin0 = lasx_loadu_f32(ptr_in); + __m256 vin1 = lasx_loadu_f32(ptr_in + 8); + vin0 = lasx_min_f32(lasx_max_f32(vin0, min_256), max_256); + vin1 = lasx_min_f32(lasx_max_f32(vin1, min_256), max_256); + lasx_storeu_f32(ptr_out, vin0); + lasx_storeu_f32(ptr_out + 8, vin1); +#else + __m128 vin0 = lsx_loadu_f32(ptr_in); + __m128 vin1 = lsx_loadu_f32(ptr_in + 4); + __m128 vin2 = lsx_loadu_f32(ptr_in + 8); + __m128 vin3 = lsx_loadu_f32(ptr_in + 12); + + vin0 = lsx_min_f32(lsx_max_f32(vin0, vmin), vmax); + vin1 = lsx_min_f32(lsx_max_f32(vin1, vmin), vmax); + vin2 = lsx_min_f32(lsx_max_f32(vin2, vmin), vmax); + vin3 = lsx_min_f32(lsx_max_f32(vin3, vmin), vmax); + + lsx_storeu_f32(ptr_out, vin0); + lsx_storeu_f32(ptr_out + 4, vin1); + lsx_storeu_f32(ptr_out + 8, vin2); + lsx_storeu_f32(ptr_out + 12, vin3); +#endif + ptr_in += 16; + ptr_out += 16; + } + for (int i = 0; i < rem_cnt; i++) { + __m128 vin0 = lsx_loadu_f32(ptr_in); + vin0 = lsx_min_f32(lsx_max_f32(vin0, vmin), vmax); + lsx_storeu_f32(ptr_out, vin0); + ptr_in += 4; + ptr_out += 4; + } + for (int i = 0; i < rem_rem; i++) { + float tmp = ptr_in[0] > min_ ? ptr_in[0] : min_; + ptr_out[0] = tmp < max_ ? tmp : max_; + ptr_in++; + ptr_out++; + } +} + +} /* namespace math */ +} /* namespace loongarch */ +} /* namespace lite */ +} /* namespace paddle */ diff --git a/lite/backends/loongarch/math/clip.h b/lite/backends/loongarch/math/clip.h new file mode 100644 index 00000000000..e28eb9f65ce --- /dev/null +++ b/lite/backends/loongarch/math/clip.h @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +void clip(const T* din, T* dout, const int num, T max_, T min_); + +} /* namespace math */ +} /* namespace loongarch */ +} /* namespace lite */ +} /* namespace paddle */ diff --git a/lite/backends/loongarch/math/common/conv_utils.cc b/lite/backends/loongarch/math/common/conv_utils.cc new file mode 100644 index 00000000000..1c46202e447 --- /dev/null +++ b/lite/backends/loongarch/math/common/conv_utils.cc @@ -0,0 +1,1578 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/common/conv_utils.h" +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +// tranpose [chout, chin, wh, ww] to [chout/block,chin,wh,ww,block] +// dout space should be allocated before calling conv_trans_weights_numc +void conv_trans_weights_numc(const float* din, + float* dout, // dout has been expanded + int chout, + int chin, + int wh, + int ww, + int block) { + // dout is [chout_expand / block , chin, wh, ww, block] + int chout_expand = (chout + block - 1) / block * block; + memset(dout, 0.f, sizeof(float) * chout_expand * chin * wh * ww); + + const float* from_address = din; + int wchwb = chin * wh * ww * block; + int whwb = wh * ww * block; + int wwb = ww * block; + + for (int wn_i = 0; wn_i < chout; wn_i++) { + for (int wc_i = 0; wc_i < chin; wc_i++) { + for (int wh_i = 0; wh_i < wh; wh_i++) { + for (int ww_i = 0; ww_i < ww; ww_i++) { + int dst_index = wn_i / block * wchwb + wc_i * whwb + wh_i * wwb + + ww_i * block + wn_i % block; + dout[dst_index] = *from_address; + from_address++; + } + } + } + } +} + +// tranpose [chout,chin,wh,ww] to [chout/block,wh,ww,chin,block] +// this function is different from conv_trans_weights_numc just +// in that we make chw->hwc +void conv_trans_weights_numc_c3(const float* din, + float* dout, + int chout, + int chin, + int wh, + int ww, + int block) { + CHECK_EQ(chin, 3); + int chout_expand = (chout + block - 1) / block * block; + memset( + dout, 0, sizeof(float) * chout_expand / block * wh * ww * chin * block); + + const float* from_address = din; + for (int wn_i = 0; wn_i < chout; wn_i++) { + for (int wc_i = 0; wc_i < chin; wc_i++) { // chin=3! 
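+        // dst_index walks the [chout/block, wh, ww, chin, block] layout,
+        // with wn_i % block as the innermost (fastest-varying) offset.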
+ for (int wh_i = 0; wh_i < wh; wh_i++) { + for (int ww_i = 0; ww_i < ww; ww_i++) { + int dst_index = wn_i / block * wh * ww * chin * block + + wh_i * ww * chin * block + ww_i * chin * block + + wc_i * block + wn_i % block; + dout[dst_index] = *from_address; + from_address++; + } + } + } + } +} + +#if __loongarch_asx +// function: input-4x8, output-8x4 +static inline void transpose4x8_ps(__m256& row0, // NOLINT + __m256& row1, // NOLINT + __m256& row2, // NOLINT + __m256& row3 // NOLINT + ) { + // vtmp0=a0b0a1b1a4b4a5b5 + __m256 vtmp0 = lasx_unpacklo_f32(row0, row1); + // vtmp1=a2b2a3b3a6b6a7b7 + __m256 vtmp1 = lasx_unpackhi_f32(row0, row1); + // vtmp2=c0d0c1d1c4d4c5d5 + __m256 vtmp2 = lasx_unpacklo_f32(row2, row3); + // vtmp3=c2d2c3d3c6d6c7d7 + __m256 vtmp3 = lasx_unpackhi_f32(row2, row3); + // vres0=a0b0c0d0a4b4c4d4 + __m256 vres0 = lasx_shuffle_f32(vtmp0, vtmp2, 0x44); // 0xaa=[01,00,01,00] + // vres1=a1b1c1d1a5b5c5d5 + __m256 vres1 = lasx_shuffle_f32(vtmp0, vtmp2, 0xee); // 0xaa=[11,10,11,10] + // vres2=a2b2c2d2a6b6c6d6 + __m256 vres2 = lasx_shuffle_f32(vtmp1, vtmp3, 0x44); // 0xaa=[01,00,01,00] + // vres3=a3b3c3d3a7b7c7d7 + __m256 vres3 = lasx_shuffle_f32(vtmp1, vtmp3, 0xee); // 0xaa=[11,10,11,10] + // row0=a0b0c0d0a1b1c1d1 + row0 = lasx_permute2f128_f32(vres0, vres1, 0x20); + // row1=a2b2c2d2a3b3c3d3 + row1 = lasx_permute2f128_f32(vres2, vres3, 0x20); + // row2=a4b4c4d4a5b5c5d5 + row2 = lasx_permute2f128_f32(vres0, vres1, 0x31); + // row3=a6b6c6d6a7b7c7d7 + row3 = lasx_permute2f128_f32(vres2, vres3, 0x31); +} +#endif + +// input [bs, ic, ih, iw] => [bs, ic/8, ih, iw, 8] +// filter [oc, 01, ih, iw] => [01, ic/8, ih, iw, 8] for depthwise +void pack8_m256(lite::Tensor* input, + lite::Tensor* output, + const int channel_num, + const bool is_filter) { + int batch_size, input_channel, input_height, input_width; + if (is_filter) { + batch_size = 1; + input_channel = input->dims()[0]; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } else { + batch_size = input->dims()[0]; + input_channel = input->dims()[1]; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + CHECK_EQ((input_channel & 7), 0); + const float* input_data = input->data(); + + const int kernel_size = input_height * input_width; + const int pack_step = 8 * kernel_size; + const int batch_step = channel_num * pack_step; + + output->Resize({batch_size, channel_num, input_height, input_width, 8}); + float* output_data = output->mutable_data(); + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + const float* input_ptr = input_data + bs * batch_step + ic * pack_step; + + const float* r0 = (input_ptr); + const float* r1 = (input_ptr + kernel_size); + const float* r2 = (input_ptr + kernel_size * 2); + const float* r3 = (input_ptr + kernel_size * 3); + const float* r4 = (input_ptr + kernel_size * 4); + const float* r5 = (input_ptr + kernel_size * 5); + const float* r6 = (input_ptr + kernel_size * 6); + const float* r7 = (input_ptr + kernel_size * 7); +#if __loongarch_asx + int loop_num = kernel_size >> 3; + int remain = kernel_size & 7; +#else + int remain = kernel_size; +#endif + +#if __loongarch_asx + for (; loop_num > 0; loop_num--) { + __m256 _row0 = lasx_loadu_f32(r0); + __m256 _row1 = lasx_loadu_f32(r1); + __m256 _row2 = lasx_loadu_f32(r2); + __m256 _row3 = lasx_loadu_f32(r3); + __m256 _row4 = lasx_loadu_f32(r4); + __m256 _row5 = lasx_loadu_f32(r5); + __m256 _row6 = lasx_loadu_f32(r6); + __m256 _row7 = lasx_loadu_f32(r7); + transpose8_ps(_row0, 
_row1, _row2, _row3, _row4, _row5, _row6, _row7); + lasx_storeu_f32(output_data, _row0); + lasx_storeu_f32(output_data + 8, _row1); + lasx_storeu_f32(output_data + 16, _row2); + lasx_storeu_f32(output_data + 24, _row3); + lasx_storeu_f32(output_data + 32, _row4); + lasx_storeu_f32(output_data + 40, _row5); + lasx_storeu_f32(output_data + 48, _row6); + lasx_storeu_f32(output_data + 56, _row7); + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + r4 += 8; + r5 += 8; + r6 += 8; + r7 += 8; + output_data += 64; + } +#endif + + for (; remain > 0; remain--) { + output_data[0] = *r0++; + output_data[1] = *r1++; + output_data[2] = *r2++; + output_data[3] = *r3++; + output_data[4] = *r4++; + output_data[5] = *r5++; + output_data[6] = *r6++; + output_data[7] = *r7++; + output_data += 8; + } // end of remain + } // end of interation_num + } // end of batch_size +} + +// input [bs, ic, ih, iw] => [bs, ic/4, ih, iw, 4] +// filter [oc, 01, ih, iw] => [01, ic/4, ih, iw, 4] for depthwise +void pack4_m128(lite::Tensor* input, + lite::Tensor* output, + const int channel_num, + const bool is_filter) { + int batch_size, input_channel, input_height, input_width; + if (is_filter) { + batch_size = 1; + input_channel = input->dims()[0]; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } else { + batch_size = input->dims()[0]; + input_channel = input->dims()[1]; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + CHECK_EQ((input_channel & 3), 0); + const float* input_data = input->data(); + + const int kernel_size = input_height * input_width; + const int pack_step = 4 * kernel_size; + const int batch_step = channel_num * pack_step; + + output->Resize({batch_size, channel_num, input_height, input_width, 4}); + float* output_data = output->mutable_data(); + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + const float* input_ptr = input_data + bs * batch_step + ic * pack_step; + + const float* r0 = (input_ptr); + const float* r1 = (input_ptr + kernel_size); + const float* r2 = (input_ptr + kernel_size * 2); + const float* r3 = (input_ptr + kernel_size * 3); +#if __loongarch_sx + int loop_num = kernel_size >> 2; + int remain = kernel_size & 3; +#else + int remain = kernel_size; +#endif + +#if __loongarch_sx + for (; loop_num > 0; loop_num--) { + __m128 _row0 = lsx_loadu_f32(r0); + __m128 _row1 = lsx_loadu_f32(r1); + __m128 _row2 = lsx_loadu_f32(r2); + __m128 _row3 = lsx_loadu_f32(r3); + LSX_TRANSPOSE4_S(_row0, _row1, _row2, _row3); + lsx_storeu_f32(output_data, _row0); + lsx_storeu_f32(output_data + 4, _row1); + lsx_storeu_f32(output_data + 8, _row2); + lsx_storeu_f32(output_data + 12, _row3); + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + output_data += 16; + } +#endif + for (; remain > 0; remain--) { + output_data[0] = *r0++; + output_data[1] = *r1++; + output_data[2] = *r2++; + output_data[3] = *r3++; + output_data += 4; + } + } // end of for ic + } // end of for bs +} + +// output_trans [bs, oc/8, oh, ow, 8] => output [bs, oc, oh, ow] +void unpack8_m256(lite::Tensor* input, lite::Tensor* output) { + CHECK_EQ(input->dims().size(), 5UL); + const int batch_size = input->dims()[0]; + const int channel_num = input->dims()[1]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const float* input_data = input->data(); + + const int kernel_size = input_height * input_width; + const int pack_step = 8 * kernel_size; + const int batch_step = channel_num * pack_step; + + output->Resize({batch_size, 
channel_num * 8, input_height, input_width}); + float* output_data = output->mutable_data(); + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + const float* r0 = input_data + bs * batch_step + ic * pack_step; + float* output_ptr = output_data + bs * batch_step + ic * pack_step; + + float* outptr0 = (output_ptr); + float* outptr1 = (output_ptr + kernel_size); + float* outptr2 = (output_ptr + kernel_size * 2); + float* outptr3 = (output_ptr + kernel_size * 3); + float* outptr4 = (output_ptr + kernel_size * 4); + float* outptr5 = (output_ptr + kernel_size * 5); + float* outptr6 = (output_ptr + kernel_size * 6); + float* outptr7 = (output_ptr + kernel_size * 7); +#if __loongarch_asx + int loop_num = kernel_size >> 3; + int remain = kernel_size & 7; +#else + int remain = kernel_size; +#endif + +#if __loongarch_asx + for (; loop_num > 0; loop_num--) { + __m256 _row0 = lasx_loadu_f32(r0); + __m256 _row1 = lasx_loadu_f32(r0 + 8); + __m256 _row2 = lasx_loadu_f32(r0 + 16); + __m256 _row3 = lasx_loadu_f32(r0 + 24); + __m256 _row4 = lasx_loadu_f32(r0 + 32); + __m256 _row5 = lasx_loadu_f32(r0 + 40); + __m256 _row6 = lasx_loadu_f32(r0 + 48); + __m256 _row7 = lasx_loadu_f32(r0 + 56); + transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + lasx_storeu_f32(outptr0, _row0); + lasx_storeu_f32(outptr1, _row1); + lasx_storeu_f32(outptr2, _row2); + lasx_storeu_f32(outptr3, _row3); + lasx_storeu_f32(outptr4, _row4); + lasx_storeu_f32(outptr5, _row5); + lasx_storeu_f32(outptr6, _row6); + lasx_storeu_f32(outptr7, _row7); + r0 += 64; + outptr0 += 8; + outptr1 += 8; + outptr2 += 8; + outptr3 += 8; + outptr4 += 8; + outptr5 += 8; + outptr6 += 8; + outptr7 += 8; + } +#endif + for (; remain > 0; remain--) { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + r0 += 8; + } // end of remain + } // end of iteration_num + } // end of batch_size +} + +// output_trans [bs, oc/4, oh, ow, 4] => output [bs, oc, oh, ow] +void unpack4_m128(lite::Tensor* input, lite::Tensor* output) { + CHECK_EQ(input->dims().size(), 5UL); + const int batch_size = input->dims()[0]; + const int channel_num = input->dims()[1]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const float* input_data = input->data(); + + const int kernel_size = input_height * input_width; + const int pack_step = 4 * kernel_size; + const int batch_step = channel_num * pack_step; + + output->Resize({batch_size, channel_num * 4, input_height, input_width}); + float* output_data = output->mutable_data(); + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + const float* r0 = input_data + bs * batch_step + ic * pack_step; + float* output_ptr = output_data + bs * batch_step + ic * pack_step; + + float* outptr0 = (output_ptr); + float* outptr1 = (output_ptr + kernel_size); + float* outptr2 = (output_ptr + kernel_size * 2); + float* outptr3 = (output_ptr + kernel_size * 3); + +#if __loongarch_sx + int loop_num = kernel_size >> 2; + int remain = kernel_size & 3; +#else + int remain = kernel_size; +#endif + +#if __loongarch_sx + for (; loop_num > 0; loop_num--) { + __m128 _row0 = lsx_loadu_f32(r0); + __m128 _row1 = lsx_loadu_f32(r0 + 4); + __m128 _row2 = lsx_loadu_f32(r0 + 8); + __m128 _row3 = lsx_loadu_f32(r0 + 12); + LSX_TRANSPOSE4_S(_row0, _row1, _row2, _row3); + lsx_storeu_f32(outptr0, _row0); + 
lsx_storeu_f32(outptr1, _row1); + lsx_storeu_f32(outptr2, _row2); + lsx_storeu_f32(outptr3, _row3); + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif + for (; remain > 0; remain--) { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + r0 += 4; + } + } // end of for ic + } // end of for bs +} + +#if __loongarch_asx +void padding8_m256(lite::Tensor* input, + lite::Tensor* output, + const std::vector& paddings) { + CHECK_EQ(paddings.size(), 4UL); + int top = paddings[0]; + int bottom = paddings[1]; + int left = paddings[2]; + int right = paddings[3]; + + if (top == 0 && bottom == 0 && left == 0 && right == 0) { + output->ShareDataWith(*input); + return; + } + + // input [bs, ic/8, ih, iw, 8] + CHECK_EQ(input->dims().size(), 5UL); + const int batch_size = input->dims()[0]; + const int channel_num = input->dims()[1]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const auto* input_data = input->data(); + + int out_height = input_height + top + bottom; + int out_width = input_width + left + right; + + // output [bs, ic/8, oh, ow, 8] + output->Resize({batch_size, channel_num, out_height, out_width, 8}); + auto output_data = output->mutable_data(); + + int top_size = top * out_width; + int bottom_size = bottom * out_width; + + __m256 pad_val = lasx_set1_f32(0.f); + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + // fill top + for (int y = 0; y < top_size; ++y) { + lasx_storeu_f32(output_data, pad_val); + output_data += 8; + } + // fill center + for (int y = 0; y < input_height; ++y) { + for (int x = 0; x < left; ++x) { + lasx_storeu_f32(output_data, pad_val); + output_data += 8; + } + for (int x = 0; x < input_width; ++x) { + lasx_storeu_f32(output_data, lasx_loadu_f32(input_data)); + input_data += 8; + output_data += 8; + } + for (int x = 0; x < right; ++x) { + lasx_storeu_f32(output_data, pad_val); + output_data += 8; + } + } + // fill bottom + for (int y = 0; y < bottom_size; ++y) { + lasx_storeu_f32(output_data, pad_val); + output_data += 8; + } + } + } +} +#endif + +#if __loongarch_sx +void padding4_m128(lite::Tensor* input, + lite::Tensor* output, + const std::vector& paddings) { + CHECK_EQ(paddings.size(), 4UL); + int top = paddings[0]; + int bottom = paddings[1]; + int left = paddings[2]; + int right = paddings[3]; + + if (top == 0 && bottom == 0 && left == 0 && right == 0) { + output->ShareDataWith(*input); + return; + } + + // input [bs, ic/4, ih, iw, 4] + CHECK_EQ(input->dims().size(), 5UL); + const int batch_size = input->dims()[0]; + const int channel_num = input->dims()[1]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const auto* input_data = input->data(); + + int out_height = input_height + top + bottom; + int out_width = input_width + left + right; + + // output [bs, ic/4, oh, ow, 4] + output->Resize({batch_size, channel_num, out_height, out_width, 4}); + auto output_data = output->mutable_data(); + + int top_size = top * out_width; + int bottom_size = bottom * out_width; + + __m128 pad_val = lsx_set1_f32(0.f); + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + // fill top + for (int y = 0; y < top_size; ++y) { + lsx_storeu_f32(output_data, pad_val); + output_data += 4; + } + // fill center + for (int y = 0; y < input_height; ++y) { + for (int x = 0; x < left; ++x) { + lsx_storeu_f32(output_data, pad_val); + output_data += 4; + } + 
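+        // copy one packed input row: each pixel carries a block of 4 channel values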
for (int x = 0; x < input_width; ++x) { + lsx_storeu_f32(output_data, lsx_loadu_f32(input_data)); + input_data += 4; + output_data += 4; + } + for (int x = 0; x < right; ++x) { + lsx_storeu_f32(output_data, pad_val); + output_data += 4; + } + } + // fill bottom + for (int y = 0; y < bottom_size; ++y) { + lsx_storeu_f32(output_data, pad_val); + output_data += 4; + } + } + } +} +#endif + +void padding1_float(lite::Tensor* input, + lite::Tensor* output, + const std::vector& paddings) { + CHECK_EQ(paddings.size(), 4UL); + int top = paddings[0]; + int bottom = paddings[1]; + int left = paddings[2]; + int right = paddings[3]; + + if (top == 0 && bottom == 0 && left == 0 && right == 0) { + output->ShareDataWith(*input); + return; + } + + // input [bs, ic, ih, iw] + CHECK_EQ(input->dims().size(), 4UL); + int batch_size = input->dims()[0]; + int input_channel = input->dims()[1]; + int input_height = input->dims()[2]; + int input_width = input->dims()[3]; + const auto* input_data = input->data(); + + int out_height = input_height + top + bottom; + int out_width = input_width + left + right; + + output->Resize({batch_size, input_channel, out_height, out_width}); + auto output_data = output->mutable_data(); + + int top_size = top * out_width; + int bottom_size = bottom * out_width; + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < input_channel; ++ic) { + // fill top + memset(output_data, 0, sizeof(float) * top_size); + output_data += top_size; + // fill center + for (int y = 0; y < input_height; ++y) { + memset(output_data, 0, sizeof(float) * left); + output_data += left; + memcpy(output_data, input_data, sizeof(float) * input_width); + output_data += input_width; + input_data += input_width; + memset(output_data, 0, sizeof(float) * right); + output_data += right; + } + // fill bottom + memset(output_data, 0, sizeof(float) * bottom_size); + output_data += bottom_size; + } + } +} + +#if __loongarch_asx +void pack_padding8_m256(lite::Tensor* input, + lite::Tensor* output, + const int channel_num, + const std::vector& paddings) { + CHECK_EQ(input->dims().size(), 4UL); + int batch_size = input->dims()[0]; + int input_channel = input->dims()[1]; + int input_height = input->dims()[2]; + int input_width = input->dims()[3]; + + CHECK_EQ((input_channel & 7), 0); + const float* input_data = input->data(); + + CHECK_EQ(paddings.size(), 4UL); + int top = paddings[0]; + int bottom = paddings[1]; + int left = paddings[2]; + int right = paddings[3]; + + // in + const int kernel_size = input_height * input_width; + const int pack_step = 8 * kernel_size; + const int batch_step = channel_num * pack_step; + + // out + int out_height = input_height + top + bottom; + int out_width = input_width + left + right; + + // output [bs, ic/8, oh, ow, 8] + output->Resize({batch_size, channel_num, out_height, out_width, 8}); + auto output_data = output->mutable_data(); + + int top_size = top * out_width; + int bottom_size = bottom * out_width; + + __m256 pad_val = lasx_set1_f32(0.f); + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + const float* input_ptr = input_data + bs * batch_step + ic * pack_step; + + const float* r0 = (input_ptr); + const float* r1 = (input_ptr + kernel_size); + const float* r2 = (input_ptr + kernel_size * 2); + const float* r3 = (input_ptr + kernel_size * 3); + const float* r4 = (input_ptr + kernel_size * 4); + const float* r5 = (input_ptr + kernel_size * 5); + const float* r6 = (input_ptr + kernel_size * 6); + const float* r7 = (input_ptr + 
kernel_size * 7); + + // fill top + for (int y = 0; y < top_size; ++y) { + lasx_storeu_f32(output_data, pad_val); + output_data += 8; + } + // fill center + for (int y = 0; y < input_height; ++y) { + for (int x = 0; x < left; ++x) { + lasx_storeu_f32(output_data, pad_val); + output_data += 8; + } + // pack and transpose + int pos = 0; + for (; pos + 7 < input_width; pos += 8) { + __m256 _row0 = lasx_loadu_f32(r0); + __m256 _row1 = lasx_loadu_f32(r1); + __m256 _row2 = lasx_loadu_f32(r2); + __m256 _row3 = lasx_loadu_f32(r3); + __m256 _row4 = lasx_loadu_f32(r4); + __m256 _row5 = lasx_loadu_f32(r5); + __m256 _row6 = lasx_loadu_f32(r6); + __m256 _row7 = lasx_loadu_f32(r7); + transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + lasx_storeu_f32(output_data, _row0); + lasx_storeu_f32(output_data + 8, _row1); + lasx_storeu_f32(output_data + 16, _row2); + lasx_storeu_f32(output_data + 24, _row3); + lasx_storeu_f32(output_data + 32, _row4); + lasx_storeu_f32(output_data + 40, _row5); + lasx_storeu_f32(output_data + 48, _row6); + lasx_storeu_f32(output_data + 56, _row7); + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + r4 += 8; + r5 += 8; + r6 += 8; + r7 += 8; + output_data += 64; + } + + for (; pos < input_width; ++pos) { + output_data[0] = *r0++; + output_data[1] = *r1++; + output_data[2] = *r2++; + output_data[3] = *r3++; + output_data[4] = *r4++; + output_data[5] = *r5++; + output_data[6] = *r6++; + output_data[7] = *r7++; + output_data += 8; + } + + for (int x = 0; x < right; ++x) { + lasx_storeu_f32(output_data, pad_val); + output_data += 8; + } + } + // fill bottom + for (int y = 0; y < bottom_size; ++y) { + lasx_storeu_f32(output_data, pad_val); + output_data += 8; + } + } + } +} +#endif + +#if __loongarch_asx +// input [bs, ic, ih, iw] => [bs, (ic + 7)/8, ih, iw, 8] +// filter [oc, 01, ih, iw] => [01, (ic + 7)/8, ih, iw, 8] for depthwise +void packC8_common(const float* din, + float* dout, + const std::vector& pad, + int h_in, + int w_in, + int channel) { + int top = pad[0]; + int bottom = pad[1]; + int left = pad[2]; + int right = pad[3]; + int w_out = (w_in + left + right); + int h_out = (h_in + top + bottom); + int block_channel = 8; + const float* din_init = din; + float* dout_init = dout; + + for (int c = 0; c < channel; c += block_channel) { + din = din_init + c * h_in * w_in; + dout = dout_init + c * w_out * h_out; + + memset(dout, 0, top * w_out * block_channel * sizeof(float)); + auto dout_block = dout + top * w_out * block_channel; + + for (int i = 0; i < h_in; i++) { + float* douth = dout_block + i * w_out * block_channel; + const float* dinh = din + i * w_in; + memset(douth, 0, left * block_channel * sizeof(float)); + douth += left * block_channel; + int kernel_size = h_in * w_in; + auto dinr0 = dinh; + auto dinr1 = dinr0 + kernel_size; + auto dinr2 = dinr1 + kernel_size; + auto dinr3 = dinr2 + kernel_size; + auto dinr4 = dinr3 + kernel_size; + auto dinr5 = dinr4 + kernel_size; + auto dinr6 = dinr5 + kernel_size; + auto dinr7 = dinr6 + kernel_size; + + int j = 0; + if (c + 7 < channel) { + for (; j + 7 < w_in; j += 8) { + __m256 _row0 = lasx_loadu_f32(dinr0); + __m256 _row1 = lasx_loadu_f32(dinr1); + __m256 _row2 = lasx_loadu_f32(dinr2); + __m256 _row3 = lasx_loadu_f32(dinr3); + __m256 _row4 = lasx_loadu_f32(dinr4); + __m256 _row5 = lasx_loadu_f32(dinr5); + __m256 _row6 = lasx_loadu_f32(dinr6); + __m256 _row7 = lasx_loadu_f32(dinr7); + transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + lasx_storeu_f32(douth, _row0); + lasx_storeu_f32(douth + 
8, _row1); + lasx_storeu_f32(douth + 16, _row2); + lasx_storeu_f32(douth + 24, _row3); + lasx_storeu_f32(douth + 32, _row4); + lasx_storeu_f32(douth + 40, _row5); + lasx_storeu_f32(douth + 48, _row6); + lasx_storeu_f32(douth + 56, _row7); + dinr0 += 8; + dinr1 += 8; + dinr2 += 8; + dinr3 += 8; + dinr4 += 8; + dinr5 += 8; + dinr6 += 8; + dinr7 += 8; + douth += 64; + } + + for (; j < w_in; j++) { + douth[0] = *dinr0++; + douth[1] = *dinr1++; + douth[2] = *dinr2++; + douth[3] = *dinr3++; + douth[4] = *dinr4++; + douth[5] = *dinr5++; + douth[6] = *dinr6++; + douth[7] = *dinr7++; + douth += 8; + } + } else { + __m256 _row0 = lasx_setzero_f32(); + __m256 _row1 = lasx_setzero_f32(); + __m256 _row2 = lasx_setzero_f32(); + __m256 _row3 = lasx_setzero_f32(); + __m256 _row4 = lasx_setzero_f32(); + __m256 _row5 = lasx_setzero_f32(); + __m256 _row6 = lasx_setzero_f32(); + __m256 _row7 = lasx_setzero_f32(); + for (; j + 7 < w_in; j += 8) { + _row0 = lasx_loadu_f32(dinr0); + if (channel - c > 1) _row1 = lasx_loadu_f32(dinr1); + if (channel - c > 2) _row2 = lasx_loadu_f32(dinr2); + if (channel - c > 3) _row3 = lasx_loadu_f32(dinr3); + if (channel - c > 4) _row4 = lasx_loadu_f32(dinr4); + if (channel - c > 5) _row5 = lasx_loadu_f32(dinr5); + if (channel - c > 6) _row6 = lasx_loadu_f32(dinr6); + transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + lasx_storeu_f32(douth, _row0); + lasx_storeu_f32(douth + 8, _row1); + lasx_storeu_f32(douth + 16, _row2); + lasx_storeu_f32(douth + 24, _row3); + lasx_storeu_f32(douth + 32, _row4); + lasx_storeu_f32(douth + 40, _row5); + lasx_storeu_f32(douth + 48, _row6); + lasx_storeu_f32(douth + 56, _row7); + dinr0 += 8; + dinr1 += 8; + dinr2 += 8; + dinr3 += 8; + dinr4 += 8; + dinr5 += 8; + dinr6 += 8; + dinr7 += 8; + douth += 64; + } + + for (; j < w_in; j++) { + douth[0] = *dinr0++; + douth[1] = channel - c > 1 ? *dinr1++ : 0; + douth[2] = channel - c > 2 ? *dinr2++ : 0; + douth[3] = channel - c > 3 ? *dinr3++ : 0; + douth[4] = channel - c > 4 ? *dinr4++ : 0; + douth[5] = channel - c > 5 ? *dinr5++ : 0; + douth[6] = channel - c > 6 ? 
*dinr6++ : 0; + douth[7] = 0; + douth += 8; + } + } + memset(douth, 0, right * block_channel * sizeof(float)); + } + memset(dout + (h_in + top) * w_out * block_channel, + 0, + bottom * w_out * block_channel * sizeof(float)); + } +} + +// output_trans [bs, (oc + 7)/8, oh, ow, 8] => output [bs, oc, oh, ow] +void unpackC8_common(const float* din, + float* dout, + int size_out_channel, + int channel) { + int block_channel = 8; + float* dout_init = dout; + + for (int c = 0; c < channel; c += block_channel) { + dout = dout_init + c * size_out_channel; + auto doutr0 = dout; + auto doutr1 = doutr0 + size_out_channel; + auto doutr2 = doutr1 + size_out_channel; + auto doutr3 = doutr2 + size_out_channel; + auto doutr4 = doutr3 + size_out_channel; + auto doutr5 = doutr4 + size_out_channel; + auto doutr6 = doutr5 + size_out_channel; + auto doutr7 = doutr6 + size_out_channel; + int j = 0; + if (c + 7 < channel) { + for (; j + 7 < size_out_channel; j += 8) { + __m256 _row0 = lasx_loadu_f32(din); + __m256 _row1 = lasx_loadu_f32(din + 8); + __m256 _row2 = lasx_loadu_f32(din + 16); + __m256 _row3 = lasx_loadu_f32(din + 24); + __m256 _row4 = lasx_loadu_f32(din + 32); + __m256 _row5 = lasx_loadu_f32(din + 40); + __m256 _row6 = lasx_loadu_f32(din + 48); + __m256 _row7 = lasx_loadu_f32(din + 56); + transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + lasx_storeu_f32(doutr0, _row0); + lasx_storeu_f32(doutr1, _row1); + lasx_storeu_f32(doutr2, _row2); + lasx_storeu_f32(doutr3, _row3); + lasx_storeu_f32(doutr4, _row4); + lasx_storeu_f32(doutr5, _row5); + lasx_storeu_f32(doutr6, _row6); + lasx_storeu_f32(doutr7, _row7); + doutr0 += 8; + doutr1 += 8; + doutr2 += 8; + doutr3 += 8; + doutr4 += 8; + doutr5 += 8; + doutr6 += 8; + doutr7 += 8; + din += 64; + } + + for (; j < size_out_channel; j++) { + *doutr0++ = *din++; + *doutr1++ = *din++; + *doutr2++ = *din++; + *doutr3++ = *din++; + *doutr4++ = *din++; + *doutr5++ = *din++; + *doutr6++ = *din++; + *doutr7++ = *din++; + } + } else { + for (; j + 7 < size_out_channel; j += 8) { + __m256 _row0 = lasx_loadu_f32(din); + __m256 _row1 = lasx_loadu_f32(din + 8); + __m256 _row2 = lasx_loadu_f32(din + 16); + __m256 _row3 = lasx_loadu_f32(din + 24); + __m256 _row4 = lasx_loadu_f32(din + 32); + __m256 _row5 = lasx_loadu_f32(din + 40); + __m256 _row6 = lasx_loadu_f32(din + 48); + __m256 _row7 = lasx_loadu_f32(din + 56); + transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + lasx_storeu_f32(doutr0, _row0); + if (channel - c > 1) lasx_storeu_f32(doutr1, _row1); + if (channel - c > 2) lasx_storeu_f32(doutr2, _row2); + if (channel - c > 3) lasx_storeu_f32(doutr3, _row3); + if (channel - c > 4) lasx_storeu_f32(doutr4, _row4); + if (channel - c > 5) lasx_storeu_f32(doutr5, _row5); + if (channel - c > 6) lasx_storeu_f32(doutr6, _row6); + doutr0 += 8; + doutr1 += 8; + doutr2 += 8; + doutr3 += 8; + doutr4 += 8; + doutr5 += 8; + doutr6 += 8; + doutr7 += 8; + din += 64; + } + + for (; j < size_out_channel; j++) { + *doutr0++ = *din; + if (channel - c > 1) *doutr1++ = *(din + 1); + if (channel - c > 2) *doutr2++ = *(din + 2); + if (channel - c > 3) *doutr3++ = *(din + 3); + if (channel - c > 4) *doutr4++ = *(din + 4); + if (channel - c > 5) *doutr5++ = *(din + 5); + if (channel - c > 6) *doutr6++ = *(din + 6); + din += 8; + } + } + } +} +#endif + +#if __loongarch_sx +void packC4_common(const float* din, + float* dout, + const std::vector& pad, + int h_in, + int w_in, + int channel) { + int top = pad[0]; + int bottom = pad[1]; + int left = pad[2]; + int 
right = pad[3]; + int w_out = (w_in + left + right); + int h_out = (h_in + top + bottom); + int block_channel = 4; + const float* din_init = din; + float* dout_init = dout; + + for (int c = 0; c < channel; c += block_channel) { + din = din_init + c * h_in * w_in; + dout = dout_init + c * w_out * h_out; + + memset(dout, 0, top * w_out * block_channel * sizeof(float)); + auto dout_block = dout + top * w_out * block_channel; + + for (int i = 0; i < h_in; i++) { + float* douth = dout_block + i * w_out * block_channel; + const float* dinh = din + i * w_in; + memset(douth, 0, left * block_channel * sizeof(float)); + douth += left * block_channel; + int kernel_size = h_in * w_in; + auto dinr0 = dinh; + auto dinr1 = dinr0 + kernel_size; + auto dinr2 = dinr1 + kernel_size; + auto dinr3 = dinr2 + kernel_size; + + int j = 0; + if (c + 3 < channel) { + for (; j + 3 < w_in; j += 4) { + __m128 _row0 = lsx_loadu_f32(dinr0); + __m128 _row1 = lsx_loadu_f32(dinr1); + __m128 _row2 = lsx_loadu_f32(dinr2); + __m128 _row3 = lsx_loadu_f32(dinr3); + transpose4_ps(_row0, _row1, _row2, _row3); + lsx_storeu_f32(douth, _row0); + lsx_storeu_f32(douth + 4, _row1); + lsx_storeu_f32(douth + 8, _row2); + lsx_storeu_f32(douth + 12, _row3); + dinr0 += 4; + dinr1 += 4; + dinr2 += 4; + dinr3 += 4; + douth += 16; + } + + for (; j < w_in; j++) { + douth[0] = *dinr0++; + douth[1] = *dinr1++; + douth[2] = *dinr2++; + douth[3] = *dinr3++; + douth += 4; + } + } else { + __m128 _row0 = lsx_setzero_f32(); + __m128 _row1 = lsx_setzero_f32(); + __m128 _row2 = lsx_setzero_f32(); + __m128 _row3 = lsx_setzero_f32(); + for (; j + 3 < w_in; j += 4) { + _row0 = lsx_loadu_f32(dinr0); + if (channel - c > 1) _row1 = lsx_loadu_f32(dinr1); + if (channel - c > 2) _row2 = lsx_loadu_f32(dinr2); + if (channel - c > 3) _row3 = lsx_loadu_f32(dinr3); + transpose4_ps(_row0, _row1, _row2, _row3); + lsx_storeu_f32(douth, _row0); + lsx_storeu_f32(douth + 4, _row1); + lsx_storeu_f32(douth + 8, _row2); + lsx_storeu_f32(douth + 12, _row3); + dinr0 += 4; + dinr1 += 4; + dinr2 += 4; + dinr3 += 4; + douth += 16; + } + + for (; j < w_in; j++) { + douth[0] = *dinr0++; + douth[1] = channel - c > 1 ? *dinr1++ : 0; + douth[2] = channel - c > 2 ? *dinr2++ : 0; + douth[3] = channel - c > 3 ? 
*dinr3++ : 0; + douth += 4; + } + } + memset(douth, 0, right * block_channel * sizeof(float)); + } + memset(dout + (h_in + top) * w_out * block_channel, + 0, + bottom * w_out * block_channel * sizeof(float)); + } +} + +void unpackC4_common(const float* din, + float* dout, + int size_out_channel, + int channel) { + int block_channel = 4; + float* dout_init = dout; + + for (int c = 0; c < channel; c += block_channel) { + dout = dout_init + c * size_out_channel; + auto doutr0 = dout; + auto doutr1 = doutr0 + size_out_channel; + auto doutr2 = doutr1 + size_out_channel; + auto doutr3 = doutr2 + size_out_channel; + int j = 0; + if (c + 3 < channel) { + for (; j + 3 < size_out_channel; j += 4) { + __m128 _row0 = lsx_loadu_f32(din); + __m128 _row1 = lsx_loadu_f32(din + 4); + __m128 _row2 = lsx_loadu_f32(din + 8); + __m128 _row3 = lsx_loadu_f32(din + 12); + transpose4_ps(_row0, _row1, _row2, _row3); + lsx_storeu_f32(doutr0, _row0); + lsx_storeu_f32(doutr1, _row1); + lsx_storeu_f32(doutr2, _row2); + lsx_storeu_f32(doutr3, _row3); + doutr0 += 4; + doutr1 += 4; + doutr2 += 4; + doutr3 += 4; + din += 16; + } + + for (; j < size_out_channel; j++) { + *doutr0++ = *din++; + *doutr1++ = *din++; + *doutr2++ = *din++; + *doutr3++ = *din++; + } + } else { + for (; j + 3 < size_out_channel; j += 4) { + __m128 _row0 = lsx_loadu_f32(din); + __m128 _row1 = lsx_loadu_f32(din + 4); + __m128 _row2 = lsx_loadu_f32(din + 8); + __m128 _row3 = lsx_loadu_f32(din + 12); + transpose4_ps(_row0, _row1, _row2, _row3); + lsx_storeu_f32(doutr0, _row0); + if (channel - c > 1) lsx_storeu_f32(doutr1, _row1); + if (channel - c > 2) lsx_storeu_f32(doutr2, _row2); + if (channel - c > 3) lsx_storeu_f32(doutr3, _row3); + doutr0 += 4; + doutr1 += 4; + doutr2 += 4; + doutr3 += 4; + din += 16; + } + + for (; j < size_out_channel; j++) { + *doutr0++ = *din; + if (channel - c > 1) *doutr1++ = *(din + 1); + if (channel - c > 2) *doutr2++ = *(din + 2); + if (channel - c > 3) *doutr3++ = *(din + 3); + din += 4; + } + } + } +} +#endif + +#if __loongarch_asx +__m256 activation8_m256(__m256 input, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param) { + if (act_type == lite_api::ActivationType::kRelu) { + return lasx_max_f32(input, lasx_setzero_f32()); + } else if (act_type == lite_api::ActivationType::kRelu6) { + __m256 _val = lasx_max_f32(input, lasx_setzero_f32()); + return lasx_min_f32(_val, lasx_set1_f32(act_param.Relu_clipped_coef)); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + __m256 _val_scale = + lasx_mul_f32(input, lasx_set1_f32(act_param.Leaky_relu_alpha)); + return lasx_blendv_f32( + _val_scale, + input, + lasx_xvfcmp_slt_s(lasx_setzero_f32(), input)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + __m256 _val_offset = + lasx_add_f32(input, lasx_set1_f32(act_param.hard_swish_offset)); + __m256 _val_scale = + lasx_mul_f32(input, lasx_set1_f32(1.0 / act_param.hard_swish_scale)); + __m256 _val = + lasx_min_f32(lasx_set1_f32(act_param.hard_swish_threshold), + lasx_max_f32(_val_offset, lasx_setzero_f32())); + return lasx_mul_f32(_val, _val_scale); + } else { + LOG(FATAL) << "[LoongArch] activation type not supported"; + } + return lasx_setzero_f32(); +} +#endif + +#if __loongarch_sx +__m128 activation4_m128(__m128 input, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param) { + if (act_type == lite_api::ActivationType::kRelu) { + return lsx_max_f32(input, lsx_setzero_f32()); + } else if (act_type == 
lite_api::ActivationType::kRelu6) { + __m128 _val = lsx_max_f32(input, lsx_setzero_f32()); + return lsx_min_f32(_val, lsx_set1_f32(act_param.Relu_clipped_coef)); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + __m128 _val_scale = + lsx_mul_f32(input, lsx_set1_f32(act_param.Leaky_relu_alpha)); + return lsx_blendv_f32( + _val_scale, input, lsx_vfcmp_slt_s(lsx_setzero_f32(), input)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + __m128 _val_offset = + lsx_add_f32(input, lsx_set1_f32(act_param.hard_swish_offset)); + __m128 _val_scale = + lsx_mul_f32(input, lsx_set1_f32(1.0 / act_param.hard_swish_scale)); + __m128 _val = lsx_min_f32(lsx_set1_f32(act_param.hard_swish_threshold), + lsx_max_f32(_val_offset, lsx_setzero_f32())); + return lsx_mul_f32(_val, _val_scale); + } else { + LOG(FATAL) << "[LoongArch] activation type not supported"; + } + return lsx_setzero_f32(); +} +#endif + +float activation1_float(float input, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param) { + if (act_type == lite_api::ActivationType::kRelu) { + return (std::max)(input, 0.f); + } else if (act_type == lite_api::ActivationType::kRelu6) { + return (std::min)((std::max)(input, 0.f), act_param.Relu_clipped_coef); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + return input > 0.f ? input : input * act_param.Leaky_relu_alpha; + } else if (act_type == lite_api::ActivationType::kHardSwish) { + return ((std::min)(act_param.hard_swish_threshold, + (std::max)(0.f, input + act_param.hard_swish_offset)) * + input / act_param.hard_swish_scale); + } else { + LOG(FATAL) << "[LoongArch] activation type not supported"; + } + return 0.f; +} + +/** + * \brief inline funcs used in im2col + * @param a + * @param b + * @return + */ +inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { + return static_cast(a) < static_cast(b); +} + +/** + * \brief normal im2col function for gemm conv + * @tparam dtype + * @param data_im + * @param channels + * @param height + * @param width + * @param kernel_size + * @param pad + * @param stride + * @param data_col + */ +template +void im2col_common(const Dtype* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + Dtype* data_col) { + const int output_h = + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; + const int output_w = + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / + stride_w + + 1; + const int channel_size = height * width; + for (int channel = channels; channel--; data_im += channel_size) { + for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_row = -pad_top + kernel_row * dilation_h; + for (int output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + for (int output_cols = output_w; output_cols; output_cols--) { + *(data_col++) = 0; + } + } else { + int input_col = -pad_left + kernel_col * dilation_w; + for (int output_col = output_w; output_col; output_col--) { + if (is_a_ge_zero_and_a_lt_b(input_col, width)) { + *(data_col++) = data_im[input_row * width + input_col]; + } else { + *(data_col++) = 0; + } + input_col += stride_w; + } + } + input_row += stride_h; + } + } + } + } +} + +template <> +void im2col_s1(const float* data_im, + int 
channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int dilation_h, + int dilation_w, + float* data_col) { + const int output_h = + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) + 1; + const int output_w = + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) + 1; + const int in_channel_size = height * width; + const int out_channel_size = output_h * output_w; + const unsigned int output_plane_size = + output_h * output_w * kernel_h * kernel_w; + size_t tmp_size = static_cast(output_plane_size); + size_t mem_size = tmp_size * channels * sizeof(float); + memset(data_col, 0, mem_size); +#pragma omp parallel for + for (int c = 0; c < channels; c++) { + unsigned int data_im_z = static_cast(c * in_channel_size); + int data_col_z1 = c * output_plane_size; + for (int ky = 0, h_offset = 0; ky < kernel_h; + ky++, h_offset += dilation_h) { + int data_col_z2 = ky * out_channel_size * kernel_w; + for (int kx = 0, w_offset = 0; kx < kernel_w; + kx++, w_offset += dilation_w) { + int data_col_z3 = kx * out_channel_size; + unsigned int data_col_z = + static_cast(data_col_z1 + data_col_z2 + data_col_z3); + int oh_begin = std::max(((pad_top - h_offset)), 0); // always >= 0 + int oh_end = std::min(((height + pad_bottom - h_offset)), output_h); + oh_end = std::max(oh_begin, oh_end); + int ow_begin = std::max(((pad_left - w_offset)), 0); + int ow_end = std::min(((width + pad_right - w_offset)), output_w); + ow_end = std::max(ow_begin, ow_end); + int ih = oh_begin - pad_top + h_offset; + for (int oh = oh_begin; oh < oh_end; ++oh, ++ih) { + int iw = ow_begin - pad_left + w_offset; + int ow = ow_begin; + unsigned int data_im_offset = data_im_z + ih * width; + unsigned int data_col_offset = data_col_z + oh * output_w; + const float* data_im_ptr = data_im + data_im_offset; + float* data_col_ptr = data_col + data_col_offset; + for (; ow + 7 < ow_end; ow += 8, iw += 8) { +#if __loongarch_asx + __m256 vtmp = lasx_loadu_f32(data_im_ptr + iw); + lasx_storeu_f32(data_col_ptr + ow, vtmp); +#else + __m128 vtmp1 = lsx_loadu_f32(data_im_ptr + iw); //TODO CHECK ME + __m128 vtmp2 = lsx_loadu_f32(data_im_ptr + iw + 4); + lsx_storeu_f32(data_col_ptr + ow, vtmp1); + lsx_storeu_f32(data_col_ptr + ow + 4, vtmp2); +#endif + } + for (; ow < ow_end; ++ow, ++iw) { + data_col[data_col_offset + ow] = data_im[data_im_offset + iw]; + } + } + } + } + } +} + +template <> +void im2col_s2(const float* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int dilation_h, + int dilation_w, + float* data_col) { + const int output_h = + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) / 2 + + 1; + const int output_w = + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / 2 + + 1; + const int in_channel_size = height * width; + const unsigned int output_plane_size = + output_h * output_w * kernel_h * kernel_w; + size_t tmp_size = static_cast(output_plane_size); + size_t mem_size = tmp_size * channels * sizeof(float); + memset(data_col, 0, mem_size); +#pragma omp parallel for + for (int c = 0; c < channels; c++) { + unsigned int data_im_z = static_cast(c * in_channel_size); + int data_col_z1 = c * output_plane_size; + for (int ky = 0, h_offset = 0; ky < kernel_h; + ky++, h_offset += dilation_h) { + int data_col_z2 = ky * output_h * output_w * kernel_w; + for (int kx = 0, w_offset = 0; kx < 
kernel_w; + kx++, w_offset += dilation_w) { + int data_col_z3 = kx * output_h * output_w; + unsigned int data_col_z = + static_cast(data_col_z1 + data_col_z2 + data_col_z3); + int oh_begin = std::max(((pad_top - h_offset + 1) / 2), 0); + int oh_end = + std::min(((height + pad_bottom - h_offset + 1) / 2), output_h); + oh_end = std::max(oh_begin, oh_end); + int ow_begin = std::max(((pad_left - w_offset + 1) / 2), 0); + int ow_end = + std::min(((width + pad_right - w_offset + 1) / 2), output_w); + ow_end = std::max(ow_begin, ow_end); + int ih = oh_begin * 2 - pad_top + h_offset; + for (int oh = oh_begin; oh < oh_end; ++oh, ih += 2) { + int iw = ow_begin * 2 - pad_left + w_offset; + int ow = ow_begin; + unsigned int data_im_offset = data_im_z + ih * width; + unsigned int data_col_offset = data_col_z + oh * output_w; + const float* data_im_ptr = data_im + data_im_offset; + float* data_col_ptr = data_col + data_col_offset; + for (; ow + 3 < ow_end; ow += 4, iw += 8) { + // a0a1a2a3 + __m128 vtmp0 = lsx_loadu_f32(data_im_ptr + iw); + // a4a5a6a7 + __m128 vtmp1 = lsx_loadu_f32(data_im_ptr + iw + 4); + // a0a2a4a6 + lsx_storeu_f32(data_col_ptr + ow, + lsx_shuffle_f32(vtmp0, vtmp1, 0x88)); + } + for (; ow < ow_end; ++ow, iw += 2) { + data_col[data_col_offset + ow] = data_im[data_im_offset + iw]; + } + } + } + } + } +} + +/** + * \brief normal im2col function for gemm conv + * @param data_im + * @param channels + * @param height + * @param width + * @param kernel_size + * @param pad + * @param stride + * @param data_col + */ +template <> +void im2col(const float* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + float* data_col) { + bool pads_equal = ((pad_top == pad_bottom) && (pad_left == pad_right)); + bool pads_all_equal = (pads_equal && pad_top == pad_left); + bool ks_equal = (stride_h == stride_w) && (kernel_h == kernel_w); + bool no_dilation = (dilation_h == 1) && (dilation_w == 1); + bool kspd = pads_all_equal && ks_equal && no_dilation; + if (kspd && stride_h == 1) { + im2col_s1(data_im, + channels, + height, + width, + kernel_h, + kernel_w, + pad_top, + pad_bottom, + pad_left, + pad_right, + dilation_h, + dilation_w, + data_col); + } else if (kspd && stride_h == 2) { + im2col_s2(data_im, + channels, + height, + width, + kernel_h, + kernel_w, + pad_top, + pad_bottom, + pad_left, + pad_right, + dilation_h, + dilation_w, + data_col); + } else { + im2col_common(data_im, + channels, + height, + width, + kernel_h, + kernel_w, + pad_top, + pad_bottom, + pad_left, + pad_right, + stride_h, + stride_w, + dilation_h, + dilation_w, + data_col); + } +} + +template <> +void im2col(const int8_t* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int8_t* data_col) { + const int output_h = + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; + const int output_w = + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / + stride_w + + 1; + const int channel_size = height * width; + for (int channel = channels; channel--; data_im += channel_size) { + for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_row = -pad_top + kernel_row * 
dilation_h; + for (int output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + for (int output_cols = output_w; output_cols; output_cols--) { + *(data_col++) = 0; + } + } else { + int input_col = -pad_left + kernel_col * dilation_w; + for (int output_col = output_w; output_col; output_col--) { + if (is_a_ge_zero_and_a_lt_b(input_col, width)) { + *(data_col++) = data_im[input_row * width + input_col]; + } else { + *(data_col++) = 0; + } + input_col += stride_w; + } + } + input_row += stride_h; + } + } + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/common/conv_utils.h b/lite/backends/loongarch/math/common/conv_utils.h new file mode 100644 index 00000000000..ad12acc24d2 --- /dev/null +++ b/lite/backends/loongarch/math/common/conv_utils.h @@ -0,0 +1,244 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/backends/loongarch/xxl.h" +#include +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +// tranpose [chout, chin, wh, ww] to [chout/block,chin,wh,ww,block] +// dout space should be allocated before calling conv_trans_weights_numc +void conv_trans_weights_numc(const float* din, + float* dout, // dout has been expanded + int chout, + int chin, + int wh, + int ww, + int block); + +// tranpose [chout,chin,wh,ww] to [chout/block,wh,ww,chin,block] +// this function is different from conv_trans_weights_numc just +// in that we make chw->hwc +void conv_trans_weights_numc_c3(const float* din, + float* dout, + int chout, + int chin, + int wh, + int ww, + int block); + +// for input and filter pack +void pack8_m256(lite::Tensor* input, + lite::Tensor* output, + const int channel_num, + const bool is_filter); +void pack4_m128(lite::Tensor* input, + lite::Tensor* output, + const int channel_num, + const bool is_filter); + +// for output unpack +void unpack8_m256(lite::Tensor* input, lite::Tensor* output); +void unpack4_m128(lite::Tensor* input, lite::Tensor* output); + +#if __loongarch_asx +// for input padding +void padding8_m256(lite::Tensor* input, + lite::Tensor* output, + const std::vector& paddings); +#endif +void padding4_m128(lite::Tensor* input, + lite::Tensor* output, + const std::vector& paddings); +void padding1_float(lite::Tensor* input, + lite::Tensor* output, + const std::vector& paddings); +#if __loongarch_asx +void pack_padding8_m256(lite::Tensor* input, + lite::Tensor* output, + const int channel_num, + const std::vector& paddings); +#endif + +// for activation - only support relu, relu6, leakyRelu, hard_swish +#ifdef __loongarch_asx +__m256 activation8_m256(__m256 input, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param); +#endif +__m128 activation4_m128(__m128 input, + const 
lite_api::ActivationType act_type, + const operators::ActivationParam act_param); +float activation1_float(float input, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param); +#if __loongarch_asx +void packC8_common(const float* din, + float* dout, + const std::vector& pad, + int h_in, + int w_in, + int channel); + +void unpackC8_common(const float* din, + float* dout, + int size_out_channel, + int channel); +#endif + +#if __loongarch_sx +void packC4_common(const float* din, + float* dout, + const std::vector& pad, + int h_in, + int w_in, + int channel); + +void unpackC4_common(const float* din, + float* dout, + int size_out_channel, + int channel); +#endif + +template +void im2col(const Dtype* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + Dtype* data_col); + +template +void im2col_common(const Dtype* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + Dtype* data_col); + +template +void im2col_s1(const Dtype* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int dilation_h, + int dilation_w, + Dtype* data_col); + +template +void im2col_s2(const Dtype* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int dilation_h, + int dilation_w, + Dtype* data_col); + +#ifdef __loongarch_asx +// From: https://stackoverflow.com/a/25627536 +inline void transpose8_ps(__m256& row0, // NOLINT + __m256& row1, // NOLINT + __m256& row2, // NOLINT + __m256& row3, // NOLINT + __m256& row4, // NOLINT + __m256& row5, // NOLINT + __m256& row6, // NOLINT + __m256& row7 // NOLINT + ) { + __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; + __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; + __t0 = lasx_unpacklo_f32(row0, row1); + __t1 = lasx_unpackhi_f32(row0, row1); + __t2 = lasx_unpacklo_f32(row2, row3); + __t3 = lasx_unpackhi_f32(row2, row3); + __t4 = lasx_unpacklo_f32(row4, row5); + __t5 = lasx_unpackhi_f32(row4, row5); + __t6 = lasx_unpacklo_f32(row6, row7); + __t7 = lasx_unpackhi_f32(row6, row7); + __tt0 = lasx_shuffle_f32(__t0, __t2, LSX_SHUFFLE(1, 0, 1, 0)); + __tt1 = lasx_shuffle_f32(__t0, __t2, LSX_SHUFFLE(3, 2, 3, 2)); + __tt2 = lasx_shuffle_f32(__t1, __t3, LSX_SHUFFLE(1, 0, 1, 0)); + __tt3 = lasx_shuffle_f32(__t1, __t3, LSX_SHUFFLE(3, 2, 3, 2)); + __tt4 = lasx_shuffle_f32(__t4, __t6, LSX_SHUFFLE(1, 0, 1, 0)); + __tt5 = lasx_shuffle_f32(__t4, __t6, LSX_SHUFFLE(3, 2, 3, 2)); + __tt6 = lasx_shuffle_f32(__t5, __t7, LSX_SHUFFLE(1, 0, 1, 0)); + __tt7 = lasx_shuffle_f32(__t5, __t7, LSX_SHUFFLE(3, 2, 3, 2)); + row0 = lasx_permute2f128_f32(__tt0, __tt4, 0x20); + row1 = lasx_permute2f128_f32(__tt1, __tt5, 0x20); + row2 = lasx_permute2f128_f32(__tt2, __tt6, 0x20); + row3 = lasx_permute2f128_f32(__tt3, __tt7, 0x20); + row4 = lasx_permute2f128_f32(__tt0, __tt4, 0x31); + row5 = lasx_permute2f128_f32(__tt1, __tt5, 0x31); + row6 = lasx_permute2f128_f32(__tt2, __tt6, 0x31); + row7 = lasx_permute2f128_f32(__tt3, __tt7, 0x31); +} +#endif + +#if __loongarch_sx +inline void transpose4_ps(__m128& row0, + __m128& row1, + __m128& row2, + 
__m128& row3) { + __m128 tmp3, tmp2, tmp1, tmp0; + tmp0 = lsx_unpacklo_f32((row0), (row1)); + tmp2 = lsx_unpacklo_f32((row2), (row3)); + tmp1 = lsx_unpackhi_f32((row0), (row1)); + tmp3 = lsx_unpackhi_f32((row2), (row3)); + row0 = lsx_movelh_f32(tmp0, tmp2); + row1 = lsx_movehl_f32(tmp2, tmp0); + row2 = lsx_movelh_f32(tmp1, tmp3); + row3 = lsx_movehl_f32(tmp3, tmp1); +} +#endif +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/concat_and_split.cc b/lite/backends/loongarch/math/concat_and_split.cc new file mode 100644 index 00000000000..276670a65c2 --- /dev/null +++ b/lite/backends/loongarch/math/concat_and_split.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/concat_and_split.h" +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. + */ +template +class ConcatFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const std::vector& input, + int axis, + lite::Tensor* output) { + // TODO(zcd): Add input data validity checking + int num = input.size(); + + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + // auto cpu_place = boost::get(context.GetPlace()); + + // computation + auto output_data = output->template mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto* input_data = input[j].data(); + for (int k = 0; k < out_rows; ++k) { + // memory::Copy(cpu_place, output_data + k * out_cols + col_idx, + // cpu_place, + // input_data + k * col_len, sizeof(T) * col_len); + std::copy_n(input_data + k * col_len, + col_len, + output_data + k * out_cols + col_idx); + } + col_idx += col_len; + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
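+ * Internally each tensor is viewed as a 2-D [rows, cols] block, where rows
+ * is the product of the dimensions before `axis`; splitting then reduces to
+ * copying a column range of every row into the matching output.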
+ */ +template +class SplitFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const std::vector& ref_inputs, + const int axis, + std::vector* outputs) { + // TODO(zcd): Add input data validity checking + size_t num = outputs->size(); + + int input_rows = 1; + auto dim_0 = ref_inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + input_rows *= dim_0[i]; + } + + int input_cols = 0; + + std::vector output_cols(outputs->size()); + for (size_t i = 0; i < num; ++i) { + int t_cols = ref_inputs[i]->numel() / input_rows; + input_cols += t_cols; + output_cols[i] = t_cols; + } + // auto cpu_place = boost::get(context.GetPlace()); + + // computation + for (int k = 0; k < input_rows; ++k) { + const T* src_ptr = input.data() + k * input_cols; + int col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int col_len = output_cols[j]; + auto* out_tensor = outputs->at(j); + if (out_tensor != nullptr) { + T* dst_ptr = out_tensor->template mutable_data() + k * col_len; + std::copy_n(src_ptr + col_idx, col_len, dst_ptr); + // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, + // sizeof(T) * col_len); + } + col_idx += col_len; + } + } + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/concat_and_split.h b/lite/backends/loongarch/math/concat_and_split.h new file mode 100644 index 00000000000..9f2fd8830c9 --- /dev/null +++ b/lite/backends/loongarch/math/concat_and_split.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/loongarch/fluid/data_type.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * \brief Concatenate the input tensors along the dimension axis. + * TODO(zcd): maybe it needs to be more detailed. + * Examples: + * Input[0] = [[1,2],[3,4]] + * Input[1] = [[5,6]] + * axis = 0 + * + * Output = [[1,2], + * [3,4], + * [5,6]] + */ +template +class ConcatFunctor { + public: + void operator()(const lite::Context& context, + const std::vector& input, + int axis, + lite::Tensor* output); +}; + +/* + * \brief Split the input tensors along the dimension axis into outputs. + * TODO(zcd): maybe it needs to be more detailed. 
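+ * The ref_inputs only supply the shapes used to size each output along the
+ * axis; entries of outputs that are null are skipped.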
+ * Examples: + * Input = [[1,2], + * [3,4], + * [5,6]] + * axis = 0 + * + * Output[0] = [[1,2],[3,4]] + * Output[1] = [[5,6]] + */ +template +class SplitFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::lite::fluid::float16) diff --git a/lite/backends/loongarch/math/context_project.cc b/lite/backends/loongarch/math/context_project.cc new file mode 100644 index 00000000000..9f071dde134 --- /dev/null +++ b/lite/backends/loongarch/math/context_project.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/context_project.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template class ContextProjectFunctor; +template class ContextProjectFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/context_project.h b/lite/backends/loongarch/math/context_project.h new file mode 100644 index 00000000000..687ee625bb1 --- /dev/null +++ b/lite/backends/loongarch/math/context_project.h @@ -0,0 +1,361 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/im2col.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * \brief Context projection concatenates features in adjacent time-steps in + * a sequence. The i-th row of the output is the concatenation of + * context_length rows of the input. The context_length rows are the + * consecutive rows from the i+shift_start row. + * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor. + * + * \param in Input data. + * \param Shape The shape of Input data: + * [mini-batch, input_hidden_size]. + * + * \param padding_data Padding data. + * \param Shape The shape of Padding data: + * [up_pad + down_pad, input_hidden_size]. + * + * \param col Col data. 
+ * \param Shape The shape of Col data: + * [mini-batch, context_length * input_hidden_size]. + * + * For a mini-batch of 2 variable lengths sentences, containing 3, and 1 + * time-steps: + * + * Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3, + * 4]. + * Besides, for the sake of simplicity, we assume M=1 and N=2. + * + * X = [[a1, a2; + * b1, b2; + * c1, c2] + * [d1, d2]] + * + * This is to say that input (X) has 4 words and the dimension of each word + * representation is 2. + * + * - Case1: + * If context_start is -1 and padding_trainable is false, we use zero to pad + * instead of learned weight to pad, + * and the context_length is 3, the output (Out) is: + * + * Out =[[0, 0, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, 0, 0 ] + * [0, 0, d1, d2, 0, 0 ]] + * + * - Case2: + * If context_start is -1 and padding_trainable is true, we use learned weight + * to pad, + * and the context_length is 3, the output (Out) is: + * + * Out = [[w1, w2, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, w3, w4] + * [w1, w2, d1, d2, w3, w4]] + * + */ + +template +class ContextProjectFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& in, + const lite::Tensor* padding_data, + bool padding_trainable, + const int context_start, + const int context_length, + const int context_stride, + const int up_pad, + const int down_pad, + lite::Tensor* col) { + auto lod_level_0 = in.lod()[0]; + + math::Im2ColFunctor im2col_ocf; + + std::vector dilation({1, 1}); + std::vector padding({up_pad, 0, down_pad, 0}); + std::vector stride({context_stride, 1}); + + int input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + if (lod_level_0[i] == lod_level_0[i + 1]) continue; + + input_row_begin = (context_start > 0) + ? 
static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + // lite::Tensor out_t = + // col->Slice(static_cast(lod_level_0[i]), + // static_cast(lod_level_0[i + 1])); + lite::Tensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + lite::Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, + 1, + 1, + context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + out_t.Resize(output_shape); + + std::vector input_shape( + {1, + input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(input_shape); + im2col_ocf(context, in_t, dilation, stride, padding, &out_t); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + if (padding_trainable) { + CHECK(padding_data != nullptr); + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + if (lod_level_0[i] == lod_level_0[i + 1]) continue; + + lite::Tensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + // add up trainable data + out_t.Resize({static_cast(sequence_height) * context_length, + sequence_width}); + + if (up_pad > 0) { // add up pad + int padding_rows = (std::min)( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ? context_length : up_pad - k; + lite::Tensor out_t_sub = out_t.Slice( + k * context_length, k * context_length + padding_size); + lite::Tensor w_sub = + padding_data->Slice(k, k + padding_size); + + out_t_sub.CopyDataFrom(w_sub); + + // framework::TensorCopy(w_sub, context.GetPlace(), context, + // &out_t_sub); + } + } + if (down_pad > 0) { // add down pad + int down_pad_begin_row = + (std::max)( + 0, (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = (std::max)(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 
1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + + lite::Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + lite::Tensor w_sub = padding_data->Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + out_t_sub.CopyDataFrom(w_sub); + // framework::TensorCopy(w_sub, context.GetPlace(), context, + // &out_t_sub); + } + } + out_t.Resize({sequence_height, + static_cast(context_length) * sequence_width}); + } + } + } +}; + +template +class ContextProjectGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& in, + bool padding_trainable, + const int context_start, + const int context_length, + const int context_stride, + const int up_pad, + const int down_pad, + bool pad_grad, + bool input_grad, + lite::Tensor* padding_data, + lite::Tensor* col) { + auto lod_level_0 = in.lod()[0]; + + math::Col2ImFunctor col2im_ocf; + + std::vector dilation({1, 1}); + std::vector padding({up_pad, 0, down_pad, 0}); + std::vector stride({context_stride, 1}); + + int input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + auto blas = math::GetBlas, T>(context); + + if (input_grad) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + if (lod_level_0[i] == lod_level_0[i + 1]) continue; + + input_row_begin = (context_start > 0) + ? static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + lite::Tensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + lite::Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, + 1, + 1, + context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + out_t.Resize(output_shape); + + std::vector input_shape( + {1, + input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(input_shape); + + col2im_ocf(context, out_t, dilation, stride, padding, &in_t); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } + if (pad_grad) { + if (padding_trainable) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + if (lod_level_0[i] == lod_level_0[i + 1]) continue; + + lite::Tensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + out_t.Resize({static_cast(sequence_height) * context_length, + sequence_width}); + + if (up_pad > 0) { + int padding_rows = std::min( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ? 
context_length : up_pad - k; + lite::Tensor out_t_sub = out_t.Slice( + k * context_length, k * context_length + padding_size); + lite::Tensor w_sub = + padding_data->Slice(k, k + padding_size); + blas.AXPY(w_sub.numel(), + static_cast(1), + out_t_sub.data(), + w_sub.data()); + } + } + if (down_pad > 0) { + int down_pad_begin_row = + std::max( + 0, (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = std::max(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) + padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + + lite::Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + lite::Tensor w_sub = padding_data->Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + blas.AXPY(w_sub.numel(), + static_cast(1), + out_t_sub.data(), + w_sub.data()); + } + } + out_t.Resize({sequence_height, + static_cast(context_length) * sequence_width}); + } + } + } + } +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv2d_transpose.cc b/lite/backends/loongarch/math/conv2d_transpose.cc new file mode 100644 index 00000000000..2876166f491 --- /dev/null +++ b/lite/backends/loongarch/math/conv2d_transpose.cc @@ -0,0 +1,539 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/loongarch/math/conv2d_transpose.h" +#include +#include "lite/backends/loongarch/math/include/mathfuns.h" +#include "lite/backends/loongarch/xxl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +static bool is_a_ge_zero_and_a_lt_b(int a, int b) { + return static_cast(a) < static_cast(b); +} + +void col2im(const float* data_col, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + float* data_im) { + memset(data_im, 0, height * width * channels * sizeof(float)); + const int output_h = + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; + const int output_w = + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; + const int channel_size = height * width; + for (int channel = channels; channel--; data_im += channel_size) { + for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_row = -pad_h0 + kernel_row * dilation_h; + for (int output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + data_col += output_w; + } else { + int input_col = -pad_w0 + kernel_col * dilation_w; + for (int output_col = output_w; output_col; output_col--) { + if (is_a_ge_zero_and_a_lt_b(input_col, width)) { + data_im[input_row * width + input_col] += *data_col; + } + data_col++; + input_col += stride_w; + } + } + input_row += stride_h; + } + } + } + } +} + +void conv_transpose_depthwise_s1(const float* dst, + const float* weights, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, + const int dilation_h, + const int dilation_w, + float* src, + LoongArchContext* ctx) { + memset(src, 0, height * width * channels * sizeof(float)); + const int output_h = + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) + 1; + const int output_w = + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) + 1; + float* zero_ptr = + static_cast(TargetMalloc(TARGET(kLoongArch), width * sizeof(float))); + memset(zero_ptr, 0, width * sizeof(float)); + const int ic_plane_size = height * width; + const int oc_plane_size = output_h * output_w; + const int rr_plane_size = kernel_h * kernel_w; + +#ifdef __loongarch_asx + __m256 vec_zero = lasx_set1_f32(0.f); + __m256 vec_width = lasx_set1_f32(width * 1.0f); +#endif +#ifdef __loongarch_sx + __m128 vec_zero_128 = lsx_set1_f32(0.f); + __m128 vec_width_128 = lsx_set1_f32(width * 1.0f); +#endif + + for (int c = 0; c < channels; c++) { + int dst_z = c * oc_plane_size; + int weight_z = c * rr_plane_size; + int src_z = c * ic_plane_size; + for (int ky = 0; ky < kernel_h; ky++) { + int weight_y = ky * kernel_w; + for (int kx = 0; kx < kernel_w; kx++) { + int weight_offset = weight_z + weight_y + kx; + const float* weight_addr = weights + weight_offset; + for (int ih = -pad_h0 + ky * dilation_h, oh = 0; oh < output_h; + ih += 4, oh += 4) { + int src_y = ih * width; + int dst_y = oh * output_w; + bool boundary_y0 = ((ih >= 0) && (ih < height)) && (oh < output_h); + bool boundary_y1 = + ((ih + 1) >= 0) && ((ih + 1) < height) && ((oh + 1) < output_h); + bool boundary_y2 = + 
((ih + 2) >= 0) && ((ih + 2) < height) && ((oh + 2) < output_h); + bool boundary_y3 = + ((ih + 3) >= 0) && ((ih + 3) < height) && ((oh + 3) < output_h); + float* src_addr_h0 = boundary_y0 ? (src + src_z + src_y) : zero_ptr; + float* src_addr_h1 = + boundary_y1 ? (src + src_z + width + src_y) : zero_ptr; + float* src_addr_h2 = + boundary_y2 ? (src + src_z + width * 2 + src_y) : zero_ptr; + float* src_addr_h3 = + boundary_y3 ? (src + src_z + width * 3 + src_y) : zero_ptr; + int iw = -pad_w0 + kx * dilation_w; + int i = 0; + +#ifdef __loongarch_asx + for (; i + 7 < output_w; i += 8, iw += 8) { + int dst_offset = dst_z + dst_y + i; + const float* dst_addr = dst + dst_offset; + const float iw_data[8] = {iw + 0.f, + iw + 1.f, + iw + 2.f, + iw + 3.f, + iw + 4.f, + iw + 5.f, + iw + 6.f, + iw + 7.f}; + // select weight + __m256 vec_iw = lasx_loadu_f32(&iw_data[0]); + __m256 vec_mask = lasx_and_f32( + lasx_cmp_f32(vec_iw, vec_zero, 13), + lasx_cmp_f32(vec_iw, vec_width, 1)); // GE:13 LT:1 + __m256 vec_weight = lasx_set1_f32(weight_addr[0]); + vec_weight = lasx_blendv_f32(vec_zero, vec_weight, vec_mask); + + // compute 4 lines + __m256 vec_dst = lasx_fmadd_f32(lasx_loadu_f32(dst_addr), + vec_weight, + lasx_loadu_f32(src_addr_h0 + iw)); + lasx_storeu_f32(src_addr_h0 + iw, vec_dst); + + vec_dst = lasx_fmadd_f32(lasx_loadu_f32(dst_addr + output_w), + vec_weight, + lasx_loadu_f32(src_addr_h1 + iw)); + lasx_storeu_f32(src_addr_h1 + iw, vec_dst); + + vec_dst = lasx_fmadd_f32(lasx_loadu_f32(dst_addr + 2 * output_w), + vec_weight, + lasx_loadu_f32(src_addr_h2 + iw)); + lasx_storeu_f32(src_addr_h2 + iw, vec_dst); + + vec_dst = lasx_fmadd_f32(lasx_loadu_f32(dst_addr + 3 * output_w), + vec_weight, + lasx_loadu_f32(src_addr_h3 + iw)); + lasx_storeu_f32(src_addr_h3 + iw, vec_dst); + } +#endif +#ifdef __loongarch_sx + for (; i + 3 < output_w; i += 4, iw += 4) { + int dst_offset = dst_z + dst_y + i; + const float* dst_addr = dst + dst_offset; + const float iw_data[4] = {iw + 0.f, iw + 1.f, iw + 2.f, iw + 3.f}; + // select weight + __m128 vec_iw_128 = lsx_loadu_f32(&iw_data[0]); + __m128 vec_mask_128 = + lsx_and_f32(lsx_cmpge_f32(vec_iw_128, vec_zero_128), + lsx_cmplt_f32(vec_iw_128, vec_width_128)); + __m128 vec_weight_128 = lsx_set1_f32(weight_addr[0]); + vec_weight_128 = + lsx_blendv_f32(vec_zero_128, vec_weight_128, vec_mask_128); + + // compute 4 lines + __m128 vec_dst_128 = + lsx_add_f32(lsx_mul_f32(vec_weight_128, lsx_loadu_f32(dst_addr)), + lsx_loadu_f32(src_addr_h0 + iw)); + lsx_storeu_f32(src_addr_h0 + iw, vec_dst_128); + + vec_dst_128 = lsx_add_f32( + lsx_mul_f32(vec_weight_128, lsx_loadu_f32(dst_addr + output_w)), + lsx_loadu_f32(src_addr_h1 + iw)); + lsx_storeu_f32(src_addr_h1 + iw, vec_dst_128); + + vec_dst_128 = + lsx_add_f32(lsx_mul_f32(vec_weight_128, + lsx_loadu_f32(dst_addr + 2 * output_w)), + lsx_loadu_f32(src_addr_h2 + iw)); + lsx_storeu_f32(src_addr_h2 + iw, vec_dst_128); + + vec_dst_128 = + lsx_add_f32(lsx_mul_f32(vec_weight_128, + lsx_loadu_f32(dst_addr + 3 * output_w)), + lsx_loadu_f32(src_addr_h3 + iw)); + lsx_storeu_f32(src_addr_h3 + iw, vec_dst_128); + } +#endif + for (; i < output_w; i++, iw++) { + bool boundary_x = ((iw >= 0) && (iw < width)); + int src_offset = src_z + src_y + iw; + int dst_offset = dst_z + dst_y + i; + src[src_offset] += (boundary_x) * (boundary_y0)*dst[dst_offset] * + weights[weight_offset]; + src[src_offset + width] += + (boundary_x) * (boundary_y1)*dst[dst_offset + output_w] * + weights[weight_offset]; + src[src_offset + width * 2] += + (boundary_x) * 
(boundary_y2)*dst[dst_offset + output_w * 2] * + weights[weight_offset]; + src[src_offset + width * 3] += + (boundary_x) * (boundary_y3)*dst[dst_offset + output_w * 3] * + weights[weight_offset]; + } + } + } + } + } + TargetFree(TARGET(kLoongArch), zero_ptr); +} + +void conv_transpose_depthwise_s2(const float* dst, + const float* weights, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, + const int dilation_h, + const int dilation_w, + float* src, + LoongArchContext* ctx) { + memset(src, 0, height * width * channels * sizeof(float)); + const int output_h = + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / 2 + 1; + const int output_w = + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / 2 + 1; + float* zero_ptr = + static_cast(TargetMalloc(TARGET(kLoongArch), width * sizeof(float))); + memset(zero_ptr, 0, width * sizeof(float)); + const int ic_plane_size = height * width; + const int oc_plane_size = output_h * output_w; + const int rr_plane_size = kernel_h * kernel_w; + +#ifdef __loongarch_asx + __m256 vec_zero = lasx_set1_f32(0.f); + __m256 vec_width = lasx_set1_f32(width * 1.0f); + const int mask_store[8] = {-1, 0, -1, 0, -1, 0, -1, 0}; + __m256i vec_store_mask = lasx_loadu_m256i((const __m256i*)&mask_store[0]); +#endif +#ifdef __loongarch_sx + __m128 vec_zero_128 = lsx_set1_f32(0.f); + __m128 vec_width_128 = lsx_set1_f32(width * 1.0f); +#endif + + for (int c = 0; c < channels; c++) { + int dst_z = c * oc_plane_size; + int weight_z = c * rr_plane_size; + int src_z = c * ic_plane_size; + for (int ky = 0; ky < kernel_h; ky++) { + int weight_y = ky * kernel_w; + for (int kx = 0; kx < kernel_w; kx++) { + int weight_offset = weight_z + weight_y + kx; + const float* weight_addr = weights + weight_offset; + for (int ih = -pad_h0 + ky * dilation_h, oh = 0; oh < output_h; + ih += 8, oh += 4) { + int src_y = ih * width; + int dst_y = oh * output_w; + bool boundary_y0 = ((ih >= 0) && (ih < height)) && (oh < output_h); + bool boundary_y1 = + ((ih + 2) >= 0) && ((ih + 2) < height) && ((oh + 1) < output_h); + bool boundary_y2 = + ((ih + 4) >= 0) && ((ih + 4) < height) && ((oh + 2) < output_h); + bool boundary_y3 = + ((ih + 6) >= 0) && ((ih + 6) < height) && ((oh + 3) < output_h); + float* src_addr_h0 = boundary_y0 ? (src + src_z + src_y) : zero_ptr; + float* src_addr_h1 = + boundary_y1 ? (src + src_z + width * 2 + src_y) : zero_ptr; + float* src_addr_h2 = + boundary_y2 ? (src + src_z + width * 4 + src_y) : zero_ptr; + float* src_addr_h3 = + boundary_y3 ? 
(src + src_z + width * 6 + src_y) : zero_ptr; + int iw = -pad_w0 + kx * dilation_w; + int i = 0; + +#ifdef __loongarch_asx + for (; i + 7 < output_w; i += 8, iw += 16) { + int dst_offset = dst_z + dst_y + i; + const float* dst_addr = dst + dst_offset; + const float iw_data[8] = {iw + 0.f, + iw + 2.f, + iw + 4.f, + iw + 6.f, + iw + 8.f, + iw + 10.f, + iw + 12.f, + iw + 14.f}; + + // select weight + __m256 vec_iw = lasx_loadu_f32(&iw_data[0]); + __m256 vec_mask = lasx_and_f32( + lasx_cmp_f32(vec_iw, vec_zero, 13), + lasx_cmp_f32(vec_iw, vec_width, 1)); // GE:13 LT:1 + __m256 vec_weight = lasx_set1_f32(weight_addr[0]); + vec_weight = lasx_blendv_f32(vec_zero, vec_weight, vec_mask); + + // compute 4 lines + __m256 vec_data_lo = lasx_loadu_f32(src_addr_h0 + iw); + __m256 vec_data_hi = lasx_loadu_f32(src_addr_h0 + iw + 8); + __m256 vec_data = + lasx_shuffle_f32(vec_data_lo, vec_data_hi, 136); // 0x88 + __m256i vec_tmp_data = + lasx_permute4x64_i64(lasx_castf32_m256i(vec_data), + 216); // 11011000b + vec_data = lasx_castm256i_f32(vec_tmp_data); + __m256 vec_dst = lasx_fmadd_f32( + lasx_loadu_f32(dst_addr), vec_weight, vec_data); + __m256 vec_dst_lo = lasx_unpacklo_f32(vec_dst, vec_zero); + __m256 vec_dst_hi = lasx_unpackhi_f32(vec_dst, vec_zero); + lasx_maskstore_f32( + src_addr_h0 + iw, + vec_store_mask, + lasx_permute2f128_f32(vec_dst_lo, vec_dst_hi, 0x20)); + lasx_maskstore_f32( + src_addr_h0 + iw + 8, + vec_store_mask, + lasx_permute2f128_f32(vec_dst_lo, vec_dst_hi, 0x31)); + + vec_data_lo = lasx_loadu_f32(src_addr_h1 + iw); + vec_data_hi = lasx_loadu_f32(src_addr_h1 + iw + 8); + vec_data = + lasx_shuffle_f32(vec_data_lo, vec_data_hi, 136); // 0x88 + vec_tmp_data = + lasx_permute4x64_i64(lasx_castf32_m256i(vec_data), + 216); // 11011000b + vec_data = lasx_castm256i_f32(vec_tmp_data); + + vec_dst = lasx_fmadd_f32( + lasx_loadu_f32(dst_addr + output_w), vec_weight, vec_data); + vec_dst_lo = lasx_unpacklo_f32(vec_dst, vec_zero); + vec_dst_hi = lasx_unpackhi_f32(vec_dst, vec_zero); + lasx_maskstore_f32( + src_addr_h1 + iw, + vec_store_mask, + lasx_permute2f128_f32(vec_dst_lo, vec_dst_hi, 0x20)); + lasx_maskstore_f32( + src_addr_h1 + iw + 8, + vec_store_mask, + lasx_permute2f128_f32(vec_dst_lo, vec_dst_hi, 0x31)); + + vec_data_lo = lasx_loadu_f32(src_addr_h2 + iw); + vec_data_hi = lasx_loadu_f32(src_addr_h2 + iw + 8); + vec_data = + lasx_shuffle_f32(vec_data_lo, vec_data_hi, 136); // 0x88 + vec_tmp_data = + lasx_permute4x64_i64(lasx_castf32_m256i(vec_data), + 216); // 11011000b + vec_data = lasx_castm256i_f32(vec_tmp_data); + vec_dst = lasx_fmadd_f32( + lasx_loadu_f32(dst_addr + 2 * output_w), vec_weight, vec_data); + vec_dst_lo = lasx_unpacklo_f32(vec_dst, vec_zero); + vec_dst_hi = lasx_unpackhi_f32(vec_dst, vec_zero); + lasx_maskstore_f32( + src_addr_h2 + iw, + vec_store_mask, + lasx_permute2f128_f32(vec_dst_lo, vec_dst_hi, 0x20)); + lasx_maskstore_f32( + src_addr_h2 + iw + 8, + vec_store_mask, + lasx_permute2f128_f32(vec_dst_lo, vec_dst_hi, 0x31)); + + vec_data_lo = lasx_loadu_f32(src_addr_h3 + iw); + vec_data_hi = lasx_loadu_f32(src_addr_h3 + iw + 8); + vec_data = + lasx_shuffle_f32(vec_data_lo, vec_data_hi, 136); // 0x88 + vec_tmp_data = + lasx_permute4x64_i64(lasx_castf32_m256i(vec_data), + 216); // 11011000b + vec_data = lasx_castm256i_f32(vec_tmp_data); + vec_dst = lasx_fmadd_f32( + lasx_loadu_f32(dst_addr + 3 * output_w), vec_weight, vec_data); + vec_dst_lo = lasx_unpacklo_f32(vec_dst, vec_zero); + vec_dst_hi = lasx_unpackhi_f32(vec_dst, vec_zero); + lasx_maskstore_f32( + src_addr_h3 + 
iw, + vec_store_mask, + lasx_permute2f128_f32(vec_dst_lo, vec_dst_hi, 0x20)); + lasx_maskstore_f32( + src_addr_h3 + iw + 8, + vec_store_mask, + lasx_permute2f128_f32(vec_dst_lo, vec_dst_hi, 0x31)); + } +#endif +#ifdef __loongarch_sx + for (; i + 3 < output_w; i += 4, iw += 8) { + int dst_offset = dst_z + dst_y + i; + const float* dst_addr = dst + dst_offset; + const float iw_data[4] = {iw + 0.f, iw + 2.f, iw + 4.f, iw + 6.f}; + + // select weight + __m128 vec_iw_128 = lsx_loadu_f32(&iw_data[0]); + __m128 vec_mask_128 = + lsx_and_f32(lsx_cmpge_f32(vec_iw_128, vec_zero_128), + lsx_cmplt_f32(vec_iw_128, vec_width_128)); + __m128 vec_weight_128 = lsx_set1_f32(weight_addr[0]); + vec_weight_128 = + lsx_blendv_f32(vec_zero_128, vec_weight_128, vec_mask_128); + + // compute 4 lines + __m128 vec_data_lo128 = lsx_loadu_f32(src_addr_h0 + iw); + __m128 vec_data_hi128 = lsx_loadu_f32(src_addr_h0 + iw + 4); + __m128 vec_data_128 = + lsx_shuffle_f32(vec_data_lo128, vec_data_hi128, 136); // 0x88 + __m128 vec_dst_128 = + lsx_add_f32(lsx_mul_f32(lsx_loadu_f32(dst_addr), vec_weight_128), + vec_data_128); + lsx_storeu_f32( + src_addr_h0 + iw, + lsx_blend_f32(vec_data_lo128, + lsx_unpacklo_f32(vec_dst_128, vec_zero_128), + 5)); + lsx_storeu_f32( + src_addr_h0 + iw + 4, + lsx_blend_f32(vec_data_hi128, + lsx_unpackhi_f32(vec_dst_128, vec_zero_128), + 5)); + + vec_data_lo128 = lsx_loadu_f32(src_addr_h1 + iw); + vec_data_hi128 = lsx_loadu_f32(src_addr_h1 + iw + 4); + vec_data_128 = + lsx_shuffle_f32(vec_data_lo128, vec_data_hi128, 136); // 0x88 + vec_dst_128 = lsx_add_f32( + lsx_mul_f32(lsx_loadu_f32(dst_addr + output_w), vec_weight_128), + vec_data_128); + lsx_storeu_f32( + src_addr_h1 + iw, + lsx_blend_f32(vec_data_lo128, + lsx_unpacklo_f32(vec_dst_128, vec_zero_128), + 5)); + lsx_storeu_f32( + src_addr_h1 + iw + 4, + lsx_blend_f32(vec_data_hi128, + lsx_unpackhi_f32(vec_dst_128, vec_zero_128), + 5)); + + vec_data_lo128 = lsx_loadu_f32(src_addr_h2 + iw); + vec_data_hi128 = lsx_loadu_f32(src_addr_h2 + iw + 4); + vec_data_128 = + lsx_shuffle_f32(vec_data_lo128, vec_data_hi128, 136); // 0x88 + vec_dst_128 = + lsx_add_f32(lsx_mul_f32(lsx_loadu_f32(dst_addr + 2 * output_w), + vec_weight_128), + vec_data_128); + lsx_storeu_f32( + src_addr_h2 + iw, + lsx_blend_f32(vec_data_lo128, + lsx_unpacklo_f32(vec_dst_128, vec_zero_128), + 5)); + lsx_storeu_f32( + src_addr_h2 + iw + 4, + lsx_blend_f32(vec_data_hi128, + lsx_unpackhi_f32(vec_dst_128, vec_zero_128), + 5)); + + vec_data_lo128 = lsx_loadu_f32(src_addr_h3 + iw); + vec_data_hi128 = lsx_loadu_f32(src_addr_h3 + iw + 4); + vec_data_128 = + lsx_shuffle_f32(vec_data_lo128, vec_data_hi128, 136); // 0x88 + vec_dst_128 = + lsx_add_f32(lsx_mul_f32(lsx_loadu_f32(dst_addr + 3 * output_w), + vec_weight_128), + vec_data_128); + lsx_storeu_f32( + src_addr_h3 + iw, + lsx_blend_f32(vec_data_lo128, + lsx_unpacklo_f32(vec_dst_128, vec_zero_128), + 5)); + lsx_storeu_f32( + src_addr_h3 + iw + 4, + lsx_blend_f32(vec_data_hi128, + lsx_unpackhi_f32(vec_dst_128, vec_zero_128), + 5)); + } +#endif + for (; i < output_w; i++, iw += 2) { + bool boundary_x = ((iw >= 0) && (iw < width)); + int src_offset = src_z + src_y + iw; + int dst_offset = dst_z + dst_y + i; + src[src_offset] += (boundary_x) * (boundary_y0)*dst[dst_offset] * + weights[weight_offset]; + src[src_offset + width * 2] += + (boundary_x) * (boundary_y1)*dst[dst_offset + output_w] * + weights[weight_offset]; + src[src_offset + width * 4] += + (boundary_x) * (boundary_y2)*dst[dst_offset + output_w * 2] * + weights[weight_offset]; + 
src[src_offset + width * 6] += + (boundary_x) * (boundary_y3)*dst[dst_offset + output_w * 3] * + weights[weight_offset]; + } + } + } + } + } + TargetFree(TARGET(kLoongArch), zero_ptr); +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv2d_transpose.h b/lite/backends/loongarch/math/conv2d_transpose.h new file mode 100644 index 00000000000..035ea9b7fdc --- /dev/null +++ b/lite/backends/loongarch/math/conv2d_transpose.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void col2im(const float* data_col, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + float* data_im); + +void fill_bias_act(float* tensor, + const float* bias, + int channel, + int channel_size, + bool flag_bias, + const operators::ActivationParam* act_param); + +void conv_transpose_depthwise_s1(const float* dst, + const float* weights, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, + const int dilation_h, + const int dilation_w, + float* src, + LoongArchContext* ctx); + +void conv_transpose_depthwise_s2(const float* dst, + const float* weights, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, + const int dilation_h, + const int dilation_w, + float* src, + LoongArchContext* ctx); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv3x3s1_depthwise_int8.cc b/lite/backends/loongarch/math/conv3x3s1_depthwise_int8.cc new file mode 100644 index 00000000000..7710558ae84 --- /dev/null +++ b/lite/backends/loongarch/math/conv3x3s1_depthwise_int8.cc @@ -0,0 +1,998 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef __loongarch_asx + +#include +#include "lite/backends/loongarch/math/include/mathfuns.h" +#include "lite/backends/loongarch/math/conv_depthwise_int8.h" +#include "lite/backends/loongarch/math/saturate.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b)) +#define DATA_PACK( \ + vzero0, vin_00, vin_10, vzero1, vin_01, vin_11, vzero2, vin_02, vin_12) \ + __m128i va, vb, vc; \ + transpose3x4_4x4_epi(vzero0, vin_00, vin_10, va); /* 0 3 6 9 */ \ + transpose3x4_4x4_epi(vzero1, vin_01, vin_11, vb); /* 1 4 7 10 */ \ + transpose3x4_4x4_epi(vzero2, vin_02, vin_12, vc); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr), vzero0); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 16), vzero1); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 32), vzero2); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 48), vin_00); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 64), vin_01); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 80), vin_02); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 96), vin_10); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 112), vin_11); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 128), vin_12); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 144), va); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 160), vb); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(doutr + 176), vc); + +#define RIGHT_PROCESS(dr0, dr1, dr2, doutr) \ + for (; w < win_new - 2; w++) { \ + *doutr++ = dr0[0]; \ + *doutr++ = dr0[1]; \ + *doutr++ = dr0[2]; \ + *doutr++ = 0; \ + *doutr++ = dr1[0]; \ + *doutr++ = dr1[1]; \ + *doutr++ = dr1[2]; \ + *doutr++ = 0; \ + *doutr++ = dr2[0]; \ + *doutr++ = dr2[1]; \ + *doutr++ = dr2[2]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + dr0++; \ + dr1++; \ + dr2++; \ + } \ + if (w == win_new - 2) { \ + *doutr++ = dr0[0]; \ + *doutr++ = dr0[1]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr1[0]; \ + *doutr++ = dr1[1]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr2[0]; \ + *doutr++ = dr2[1]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + dr0++; \ + dr1++; \ + dr2++; \ + } \ + if (w == win_new - 1) { \ + *doutr++ = dr0[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr1[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr2[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + dr0++; \ + dr1++; \ + dr2++; \ + } + +#define LEFT_PROCESS(dr0, dr1, dr2, doutr) \ + if (win_new >= 2) { \ + *doutr++ = 0; \ + *doutr++ = dr0[0]; \ + *doutr++ = dr0[1]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr1[0]; \ + *doutr++ = dr1[1]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr2[0]; \ + *doutr++ = dr2[1]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + w++; \ + } else { \ + *doutr++ = 0; \ + *doutr++ = dr0[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr1[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr2[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + w++; \ + } +#define LEFT_PROCESS_MORE(dr0, dr1, dr2, doutr) \ + for (; w < pad_w - 2; w++) { \ + memset(doutr, 0, sizeof(int8_t) * 16); \ + 
doutr += 16; \ + } \ + /* pad_w = 2 */ \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr0[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr1[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = dr2[0]; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + *doutr++ = 0; \ + w++; \ + /* pad_w = 1 */ \ + LEFT_PROCESS(dr0, dr1, dr2, doutr) +#define MID_PROCESS(dr0, dr1, dr2, doutr) \ + for (; w < win_new - 14; w += 12) { \ + __m128i vin_r0 = lsx_loadu_m128i(reinterpret_cast<__m128i const*>(dr0)); \ + __m128i vin_r1 = lsx_loadu_m128i(reinterpret_cast<__m128i const*>(dr1)); \ + __m128i vin_r2 = lsx_loadu_m128i(reinterpret_cast<__m128i const*>(dr2)); \ + /* 01234567->12345678 */ \ + __m128i vin_r01 = lsx_shuffle_i8(vin_r0, vb1); \ + __m128i vin_r11 = lsx_shuffle_i8(vin_r1, vb1); \ + __m128i vin_r21 = lsx_shuffle_i8(vin_r2, vb1); \ + /* 01234567->23456789 */ \ + __m128i vin_r02 = lsx_shuffle_i8(vin_r0, vb2); \ + __m128i vin_r12 = lsx_shuffle_i8(vin_r1, vb2); \ + __m128i vin_r22 = lsx_shuffle_i8(vin_r2, vb2); \ + /* 01234567->012a 345a 678a */ \ + __m128i vin_00 = lsx_shuffle_i8(vin_r0, vmask); \ + __m128i vin_10 = lsx_shuffle_i8(vin_r1, vmask); \ + __m128i vin_20 = lsx_shuffle_i8(vin_r2, vmask); \ + /* 12345678-> 123a 456a 789a */ \ + __m128i vin_01 = lsx_shuffle_i8(vin_r01, vmask); \ + __m128i vin_11 = lsx_shuffle_i8(vin_r11, vmask); \ + __m128i vin_21 = lsx_shuffle_i8(vin_r21, vmask); \ + /* 23456789-> 234a 567a 8910a */ \ + __m128i vin_02 = lsx_shuffle_i8(vin_r02, vmask); \ + __m128i vin_12 = lsx_shuffle_i8(vin_r12, vmask); \ + __m128i vin_22 = lsx_shuffle_i8(vin_r22, vmask); \ + /* a0b0c0d0, a1b1c1d1 -> a0a1b0b1c0d0d0d1 */ \ + DATA_PACK(vin_00, \ + vin_10, \ + vin_20, \ + vin_01, \ + vin_11, \ + vin_21, \ + vin_02, \ + vin_12, \ + vin_22) \ + dr0 += 12; \ + dr1 += 12; \ + dr2 += 12; \ + doutr += 192; \ + } +#define MID_PROCESS_PAD_1(dr0, dr1, doutr) \ + __m128i vzero0 = lsx_set1_i8(0); \ + __m128i vzero1 = lsx_set1_i8(0); \ + __m128i vzero2 = lsx_set1_i8(0); \ + __m128i vin_r0 = lsx_loadu_m128i(reinterpret_cast<__m128i const*>(dr0)); \ + __m128i vin_r1 = lsx_loadu_m128i(reinterpret_cast<__m128i const*>(dr1)); \ + /* 01234567->12345678 */ \ + __m128i vin_r01 = lsx_shuffle_i8(vin_r0, vb1); \ + __m128i vin_r11 = lsx_shuffle_i8(vin_r1, vb1); \ + /* 01234567->23456789 */ \ + __m128i vin_r02 = lsx_shuffle_i8(vin_r0, vb2); \ + __m128i vin_r12 = lsx_shuffle_i8(vin_r1, vb2); \ + /* 01234567->012a 345a 678a */ \ + __m128i vin_00 = lsx_shuffle_i8(vin_r0, vmask); \ + __m128i vin_10 = lsx_shuffle_i8(vin_r1, vmask); \ + /* 12345678-> 123a 456a 789a */ \ + __m128i vin_01 = lsx_shuffle_i8(vin_r01, vmask); \ + __m128i vin_11 = lsx_shuffle_i8(vin_r11, vmask); \ + /* 23456789-> 234a 567a 8910a */ \ + __m128i vin_02 = lsx_shuffle_i8(vin_r02, vmask); \ + __m128i vin_12 = lsx_shuffle_i8(vin_r12, vmask); + +#define TOP_MID_PAD_1 \ + /* a0b0c0d0, a1b1c1d1 -> a0a1b0b1c0d0d0d1 */ \ + DATA_PACK( \ + vzero0, vin_00, vin_10, vzero1, vin_01, vin_11, vzero2, vin_02, vin_12) \ + dr0 += 12; \ + dr1 += 12; \ + doutr += 192; + +#define BOT_MID_PAD_1 \ + /* a0b0c0d0, a1b1c1d1 -> a0a1b0b1c0d0d0d1 */ \ + DATA_PACK( \ + vin_00, vin_10, vzero0, vin_01, vin_11, vzero1, vin_02, vin_12, vzero2) \ + dr0 += 12; \ + dr1 += 12; \ + doutr += 192; + +#define MID_PROCESS_PAD_2(dr0, doutr) \ + __m128i vzero0 = lsx_set1_i8(0); \ + __m128i vzero1 = lsx_set1_i8(0); \ + __m128i vzero2 = lsx_set1_i8(0); \ + __m128i vin_r0 = lsx_loadu_m128i(reinterpret_cast<__m128i 
const*>(dr0)); \ + __m128i vin_10 = lsx_set1_i8(0); \ + __m128i vin_11 = lsx_set1_i8(0); \ + __m128i vin_12 = lsx_set1_i8(0); \ + /* 01234567->12345678 */ \ + __m128i vin_r01 = lsx_shuffle_i8(vin_r0, vb1); \ + /* 01234567->23456789 */ \ + __m128i vin_r02 = lsx_shuffle_i8(vin_r0, vb2); \ + /* 01234567->012a 345a 678a */ \ + __m128i vin_00 = lsx_shuffle_i8(vin_r0, vmask); \ + /* 12345678-> 123a 456a 789a */ \ + __m128i vin_01 = lsx_shuffle_i8(vin_r01, vmask); \ + /* 23456789-> 234a 567a 8910a */ \ + __m128i vin_02 = lsx_shuffle_i8(vin_r02, vmask); + +#define TOP_MID_PAD_2 \ + /* a0b0c0d0, a1b1c1d1 -> a0a1b0b1c0d0d0d1 */ \ + DATA_PACK( \ + vzero0, vin_00, vin_10, vzero1, vin_01, vin_11, vzero2, vin_02, vin_12) \ + dr0 += 12; \ + doutr += 192; + +#define BOT_MID_PAD_2 \ + /* a0b0c0d0, a1b1c1d1 -> a0a1b0b1c0d0d0d1 */ \ + DATA_PACK( \ + vin_00, vin_10, vzero0, vin_01, vin_11, vzero1, vin_02, vin_12, vzero2) \ + dr0 += 12; \ + doutr += 192; + +// a0b0c0d0 a1b1c1d1 a2b2c2d2 -> a0a1a20 b0b1b20 c0c1c20 d0d1d20 +inline void transpose3x4_4x4_epi(__m128i& row0, // NOLINT + __m128i& row1, // NOLINT + __m128i& row2, // NOLINT + __m128i& row3 // NOLINT + ) { + __m128i tmp0 = lsx_unpacklo_i32(row0, row1); // a0a1b0b1 + __m128i tmp1 = lsx_unpackhi_i32(row0, row1); // c0c1d0d1 + // int32 -> fp32 + __m128 v0 = lsx_cvti32_f32(row2); // a2b2c2d2 + __m128 v1 = lsx_cvti32_f32(tmp0); // a0a1b0b1 + __m128 v2 = lsx_cvti32_f32(tmp1); // c0c1d0d1 + // a0a1a2b2 + __m128 v00 = lsx_shuffle_f32(v1, v0, 0x44); + // b0b1b2c2 + __m128 v01 = lsx_shuffle_f32(v1, v0, 0x9e); // [10, 01, 11, 10] + // c0c1c2d2 + __m128 v02 = lsx_shuffle_f32(v2, v0, 0xe4); // [11, 10, 01, 00] + // d0d1c2d2 + __m128 v03 = lsx_shuffle_f32(v2, v0, 0xee); // [11, 10, 11, 10] + // fp32 -> int32 + row0 = lsx_cvtf32_i32(v00); + row1 = lsx_cvtf32_i32(v01); + row2 = lsx_cvtf32_i32(v02); + row3 = lsx_cvtf32_i32(v03); + // d0d1d2d2 + row3 = lsx_shuffle_i32(row3, 0xf4); // [11, 11, 01, 00] +} + +void prepack_input_im2col_s1_int8(const int8_t* din, + int8_t* dout, + int pad_w, + int pad_h, + int win, + int hin, + int win_round, + int hin_round) { + int h = 0; + int8_t* dout_ptr = dout; + const int8_t* din_ptr = din; + int win_new = win + pad_w; + int hin_new = hin + pad_h; + __m128i vb1 = + lsx_set_i8(-127, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + __m128i vb2 = + lsx_set_i8(-127, -127, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2); + __m128i vmask = lsx_set_i8( + -127, 11, 10, 9, -127, 8, 7, 6, -127, 5, 4, 3, -127, 2, 1, 0); + int8_t zero_ptr[32]; + memset(zero_ptr, 0, sizeof(int8_t) * 32); + if (pad_w == 0) { + // top + if (pad_h == 1) { // top only support pad_h = 0 or 1 + int w = 0; + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + int8_t* doutr = dout_ptr; + // mid-cnt + if (hin >= 2) { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_1(dr0, dr1, doutr) + TOP_MID_PAD_1 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, dr1, doutr) + } + } else { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + TOP_MID_PAD_2 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, tmp_ptr, doutr) + } + } + h++; + dout_ptr += win_round; + } else if (pad_h > 1) { + for (; h < pad_h - 2; h++) { + memset(dout_ptr, 0, sizeof(int8_t) * win_round); + dout_ptr += win_round; + } + // pad_h = 2 + int w = 0; + const int8_t* dr0 = din_ptr; + int8_t* doutr = dout_ptr; + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + TOP_MID_PAD_2 + } + if (w < 
win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, tmp_ptr, dr0, doutr) + } + h++; + dout_ptr += win_round; + // pad_h = 1 + w = 0; + dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + doutr = dout_ptr; + // mid-cnt + if (hin >= 2) { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_1(dr0, dr1, doutr) + TOP_MID_PAD_1 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, dr1, doutr) + } + } else { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + TOP_MID_PAD_2 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, tmp_ptr, doutr) + } + } + h++; + dout_ptr += win_round; + } + // mid + for (; h < hin_round && h < hin_new - 2; h++) { + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + const int8_t* dr2 = dr1 + win; + int8_t* doutr = dout_ptr; + int w = 0; + MID_PROCESS(dr0, dr1, dr2, doutr) + RIGHT_PROCESS(dr0, dr1, dr2, doutr) + din_ptr += win; + dout_ptr += win_round; + } + // bottom + if (h < hin_round) { // bottom + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + int8_t* doutr = dout_ptr; + int w = 0; + if (h == hin_new - 2) { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_1(dr0, dr1, doutr) + BOT_MID_PAD_1 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(dr0, dr1, tmp_ptr, doutr) + } + } + if (h == hin_new - 1) { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + BOT_MID_PAD_2 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(dr0, tmp_ptr, tmp_ptr, doutr) + } + } + } + } else if (pad_w == 1) { + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + int8_t* doutr = dout_ptr; + int w = 0; + if (pad_h == 1) { + auto tmp_ptr = zero_ptr; + if (hin >= 2) { + LEFT_PROCESS(tmp_ptr, dr0, dr1, doutr); + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_1(dr0, dr1, doutr) + TOP_MID_PAD_1 + } + if (w < win_new) { + tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, dr1, doutr) + } + } else { + LEFT_PROCESS(tmp_ptr, dr0, tmp_ptr, doutr); + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + TOP_MID_PAD_2 + } + if (w < win_new) { + tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, tmp_ptr, doutr) + } + } + h++; + dout_ptr += win_round; + } else if (pad_h > 1) { + for (; h < pad_h - 2; h++) { + memset(dout_ptr, 0, sizeof(int8_t) * win_round); + dout_ptr += win_round; + } + // pad_h = 2 + int w = 0; + const int8_t* dr0 = din_ptr; + int8_t* doutr = dout_ptr; + auto tmp_ptr = zero_ptr; + LEFT_PROCESS(tmp_ptr, tmp_ptr, dr0, doutr); + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + TOP_MID_PAD_2 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, tmp_ptr, dr0, doutr) + } + h++; + dout_ptr += win_round; + // pad_h = 1 + w = 0; + dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + doutr = dout_ptr; + tmp_ptr = zero_ptr; + LEFT_PROCESS(tmp_ptr, dr0, dr1, doutr); + // mid-cnt + if (hin >= 2) { + MID_PROCESS_PAD_1(dr0, dr1, doutr) + TOP_MID_PAD_1 + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, dr1, doutr) + } + } else { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + TOP_MID_PAD_2 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, tmp_ptr, doutr) + } + } + h++; + dout_ptr += win_round; + } + // mid + for (; h < hin_round && h < hin_new - 2; h++) { + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + const int8_t* dr2 = dr1 + win; + 
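+      // middle rows: dr0/dr1/dr2 are three consecutive valid input rows, so
+      // pack the left padding column, the vectorized body, and the right tail.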
int8_t* doutr = dout_ptr; + w = 0; + LEFT_PROCESS(dr0, dr1, dr2, doutr) + MID_PROCESS(dr0, dr1, dr2, doutr) + RIGHT_PROCESS(dr0, dr1, dr2, doutr) + din_ptr += win; + dout_ptr += win_round; + } + // bottom + if (h < hin_round) { // bottom + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + int8_t* doutr = dout_ptr; + auto tmp_ptr0 = zero_ptr; + int w = 0; + if (h == hin_new - 2) { + LEFT_PROCESS(dr0, dr1, tmp_ptr0, doutr) + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_1(dr0, dr1, doutr) + BOT_MID_PAD_1 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(dr0, dr1, tmp_ptr, doutr) + } + } + if (h == hin_new - 1) { + LEFT_PROCESS(dr0, tmp_ptr0, tmp_ptr0, doutr) + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + BOT_MID_PAD_2 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(dr0, tmp_ptr, tmp_ptr, doutr) + } + } + } + } else { + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + int8_t* doutr = dout_ptr; + int w = 0; + auto tmp_ptr = zero_ptr; + if (pad_h == 1) { + if (h > hin - 1) { + LEFT_PROCESS_MORE(tmp_ptr, dr0, dr1, doutr) + } else { + LEFT_PROCESS_MORE(tmp_ptr, tmp_ptr, dr0, doutr) + } + } else if (pad_h > 1) { + for (; h < pad_h - 2; h++) { + memset(dout_ptr, 0, sizeof(int8_t) * win_round); + dout_ptr += win_round; + } + // pad_h = 2 + int w = 0; + const int8_t* dr0 = din_ptr; + int8_t* doutr = dout_ptr; + tmp_ptr = zero_ptr; + LEFT_PROCESS_MORE(tmp_ptr, tmp_ptr, dr0, doutr); + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + TOP_MID_PAD_2 + } + if (w < win_new) { + tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, tmp_ptr, dr0, doutr) + } + h++; + dout_ptr += win_round; + // pad_h = 1 + w = 0; + dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + doutr = dout_ptr; + tmp_ptr = zero_ptr; + LEFT_PROCESS_MORE(tmp_ptr, dr0, dr1, doutr); + // mid-cnt + if (hin >= 2) { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_1(dr0, dr1, doutr) + TOP_MID_PAD_1 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, dr1, doutr) + } + } else { + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + TOP_MID_PAD_2 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, tmp_ptr, doutr) + } + } + h++; + dout_ptr += win_round; + } + // mid + for (; h < hin_round && h < hin_new - 2; h++) { + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + const int8_t* dr2 = dr1 + win; + int8_t* doutr = dout_ptr; + int w = 0; + LEFT_PROCESS_MORE(dr0, dr1, dr2, doutr) + MID_PROCESS(dr0, dr1, dr2, doutr) + RIGHT_PROCESS(dr0, dr1, dr2, doutr) + din_ptr += win; + dout_ptr += win_round; + } + // bottom + if (h < hin_round) { // bottom + const int8_t* dr0 = din_ptr; + const int8_t* dr1 = din_ptr + win; + int8_t* doutr = dout_ptr; + auto tmp_ptr0 = zero_ptr; + w = 0; + if (h == hin_new - 2) { + LEFT_PROCESS_MORE(tmp_ptr0, dr1, dr0, doutr) + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_1(dr0, dr1, doutr) + BOT_MID_PAD_1 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, dr1, doutr) + } + } + if (h == hin_new - 1) { + LEFT_PROCESS_MORE(tmp_ptr0, tmp_ptr0, dr0, doutr) + for (; w < win_new - 14; w += 12) { + MID_PROCESS_PAD_2(dr0, doutr) + BOT_MID_PAD_2 + } + if (w < win_new) { + auto tmp_ptr = zero_ptr; + RIGHT_PROCESS(tmp_ptr, dr0, tmp_ptr, doutr) + } + } + } + } +} +template +inline void store_data_dtype_8(Dtype* dout, + __m256i vin, + __m256 vscale, + __m256 vbias); + +template +inline 
void store_data_dtype_2(Dtype* dout, + __m256i vin, + __m256 vscale, + __m256 vbias); + +template +inline void store_data_dtype_1(Dtype* dout, + __m128i vin, + __m128 vscale, + __m128 vbias); + +template <> +inline void store_data_dtype_8(float* dout, + __m256i vin, + __m256 vscale, + __m256 vbias) { + // int32 -> fp32 + __m256 vout = lasx_cvti32_f32(vin); + // * scale + bias + __m256 vres = lasx_fmadd_f32(vout, vscale, vbias); + // a0b0c0d0a4b4c4d4 -> a0a4b0b4c0c4d0d4 + __m128 vres_0 = lasx_extractf128_f32(vres, 0); + __m128 vres_1 = lasx_extractf128_f32(vres, 1); + // a0a4b0b4 + lsx_storeu_f32(dout, lsx_unpacklo_f32(vres_0, vres_1)); + // c0c4d0d4 + lsx_storeu_f32(dout + 4, lsx_unpackhi_f32(vres_0, vres_1)); +} +template <> +inline void store_data_dtype_8(int8_t* dout, + __m256i vin, + __m256 vscale, + __m256 vbias) { + __m128 vmax = lsx_set1_f32(-127); + // int32 -> fp32 + __m256 vout = lasx_cvti32_f32(vin); + // * scale + bias + __m256 vres = lasx_fmadd_f32(vout, vscale, vbias); + // a0b0c0d0a4b4c4d4 -> a0a4b0b4c0c4d0d4 + __m128 vres_0_0 = lasx_extractf128_f32(vres, 0); + __m128 vres_1_0 = lasx_extractf128_f32(vres, 1); + // -127 + __m128 vres_0 = lsx_blendv_f32(vmax, vres_0_0, lsx_cmpgt_f32(vres_0_0, vmax)); + __m128 vres_1 = lsx_blendv_f32(vmax, vres_1_0, lsx_cmpgt_f32(vres_1_0, vmax)); + // a0a4b0b4 + __m128 vout0 = lsx_unpacklo_f32(vres_0, vres_1); + // c0c4d0d4 + __m128 vout1 = lsx_unpackhi_f32(vres_0, vres_1); + // fp32 -> int32 + __m128i v0_i32 = lsx_cvtf32_i32(vout0); + __m128i v1_i32 = lsx_cvtf32_i32(vout1); + // int32 -> int16 + __m128i v0_i16 = lsx_packs_i32(v0_i32, v0_i32); + __m128i v1_i16 = lsx_packs_i32(v1_i32, v1_i32); + // int16 -> int8 + __m128i v0_i8 = lsx_packs_i16(v0_i16, v0_i16); + __m128i v1_i8 = lsx_packs_i16(v1_i16, v1_i16); + lsx_storel_i64(reinterpret_cast<__m128i*>(dout), + lsx_unpacklo_i32(v0_i8, v1_i8)); +} +template <> +inline void store_data_dtype_2(float* dout, + __m256i vin, + __m256 vscale, + __m256 vbias) { + // int32 -> fp32 + __m256 vout = lasx_cvti32_f32(vin); + // * scale + bias + __m256 vres = lasx_fmadd_f32(vout, vscale, vbias); + // a0b0c0d0a4b4c4d4 -> a0a4b0b4c0c4d0d4 + dout[0] = (reinterpret_cast(&vres))[0]; + dout[1] = (reinterpret_cast(&vres))[4]; +} +template <> +inline void store_data_dtype_2(int8_t* dout, + __m256i vin, + __m256 vscale, + __m256 vbias) { + // int32 -> fp32 + __m256 vout = lasx_cvti32_f32(vin); + // * scale + bias + __m256 vres = lasx_fmadd_f32(vout, vscale, vbias); + // a0b0c0d0a4b4c4d4 -> a0a4b0b4c0c4d0d4 + float v0 = (reinterpret_cast(&vres))[0]; + float v1 = (reinterpret_cast(&vres))[4]; + v0 = v0 > -127 ? v0 : -127; + v1 = v1 > -127 ? v1 : -127; + dout[0] = saturate_cast(v0); + dout[1] = saturate_cast(v1); +} +template <> +inline void store_data_dtype_1(float* dout, + __m128i vin, + __m128 vscale, + __m128 vbias) { + // int32 -> fp32 + __m128 vout = lsx_cvti32_f32(vin); + // * scale + bias + __m128 vres = lsx_fmadd_f32(vout, vscale, vbias); + // a0b0c0d0a4b4c4d4 -> a0a4b0b4c0c4d0d4 + dout[0] = (reinterpret_cast(&vres))[0]; +} +template <> +inline void store_data_dtype_1(int8_t* dout, + __m128i vin, + __m128 vscale, + __m128 vbias) { + // int32 -> fp32 + __m128 vout = lsx_cvti32_f32(vin); + // * scale + bias + __m128 vres = lsx_fmadd_f32(vout, vscale, vbias); + // a0b0c0d0a4b4c4d4 -> a0a4b0b4c0c4d0d4 + float v0 = (reinterpret_cast(&vres))[0]; + v0 = v0 > -127 ? 
v0 : -127; + dout[0] = saturate_cast(v0); +} + +template +void conv_3x3s1_dw_int8(Dtype* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx) { + // weights: [cout, 1, kh, kw] + // din: [num, chin, h, w] -> [num, chin, outh, outw, 9] + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + const int win_round = wout * 16; + const int hin_round = hout + 2; + + int w_stride = 9; // kernel_w * kernel_h; + int omp_num = num * chin; + int pre_in_size = hin_round * win_round; + int cnt = wout >> 3; + int remain = wout % 8; + __m128i vmask = lsx_set_i8( + -127, -127, -127, -127, -127, 8, 7, 6, -127, 5, 4, 3, -127, 2, 1, 0); + __m128i vone = lsx_set1_i16(1); + __m256i vone_l = lasx_set1_i16(1); + + int rem_cnt = remain >> 1; + int rem_rem = remain & 1; + bool flag_bias = bias ? true : false; + int8_t* pre_din = static_cast( + TargetMalloc(TARGET(kLoongArch), + std::max(pre_in_size * omp_num * sizeof(int8_t), + 32 * omp_num * sizeof(int8_t)))); + // LOG(INFO) << "prepack_input_im2col_s1_int8: "; + // auto start = clock(); + for (int n = 0; n < omp_num; ++n) { + const int8_t* din_batch = din + n * size_in_channel; + int8_t* out_ptr = pre_din + n * pre_in_size; + // im2col data [num, chin, h, w] -> [num, chin, outh, outw, 9 + 7(0)] + // int8 -> int16 -> +128 ->uint16 -> int8 + prepack_input_im2col_s1_int8( + din_batch, out_ptr, pad_w, pad_h, win, hin, win_round, hout); + } +// auto end = clock(); +// LOG(INFO) << "im2col duration: " << (end-start) * 1000.0 /CLOCKS_PER_SEC; +// start = clock(); +#pragma omp parallel for + for (int n = 0; n < omp_num; ++n) { + int8_t* pre_din_ptr0 = pre_din + n * pre_in_size; + Dtype* dout_batch = dout + n * size_out_channel; + int now_c = n % chin; + float bias_val = flag_bias ? static_cast(bias[now_c]) : 0; + const int8_t* weight_ptr = weights + now_c * w_stride; + __m128 vscale = lsx_set1_f32(scale[now_c]); + __m128 vbias = lsx_set1_f32(bias_val); + __m256 vscale_l = lasx_set1_f32(scale[now_c]); + __m256 vbias_l = lasx_set1_f32(bias_val); + // w00w01w02w10w11w12w20w21w22w00w01w02.. 
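+    // Reduction scheme used by the loops below (a descriptive note, not part
+    // of the original patch): lasx_maddubs_i16 treats the repacked im2col
+    // bytes as unsigned, multiplies them by the signed weights and adds
+    // adjacent pairs into int16; lasx_madd_i16 with a vector of int16 ones
+    // widens and adds another pair into int32; two lasx_hadd_i32 passes then
+    // collapse each 16-byte group (one output pixel) into a single int32
+    // accumulator before store_data_dtype_* applies scale and bias. The
+    // shuffle below rearranges the nine weights as
+    //   w00 w01 w02 0 | w10 w11 w12 0 | w20 w21 w22 0 | 0 0 0 0
+    // so each 4-byte group pairs one kernel row with the matching group of
+    // the zero-padded im2col input.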
+ __m128i weight_val = + lsx_loadu_m128i(reinterpret_cast<__m128i const*>(weight_ptr)); + // set - w00w01w02-0-w10w11w12-0-w20w21w22-0-0000 + __m128i vw_temp = lsx_shuffle_i8(weight_val, vmask); + __m256i vw = lasx_broadcastm128i_m256i(vw_temp); + for (int h = 0; h < hout; h++) { + int8_t* pre_din_ptr = pre_din_ptr0; + Dtype* dout_ptr = dout_batch; + for (int w = 0; w < cnt; w++) { + __m256i vin0 = + lasx_loadu_m256i(reinterpret_cast<__m256i const*>(pre_din_ptr)); + __m256i vin1 = lasx_loadu_m256i( + reinterpret_cast<__m256i const*>(pre_din_ptr + 32)); + __m256i vin2 = lasx_loadu_m256i( + reinterpret_cast<__m256i const*>(pre_din_ptr + 64)); + __m256i vin3 = lasx_loadu_m256i( + reinterpret_cast<__m256i const*>(pre_din_ptr + 96)); + __m256i vout0 = lasx_set1_i32(0); + __m256i vout1 = lasx_set1_i32(0); + __m256i vout2 = lasx_set1_i32(0); + __m256i vout3 = lasx_set1_i32(0); + // u8 * s8 = s16 + __m256i vsum0 = lasx_maddubs_i16(vin0, vw); + __m256i vsum1 = lasx_maddubs_i16(vin1, vw); + __m256i vsum2 = lasx_maddubs_i16(vin2, vw); + __m256i vsum3 = lasx_maddubs_i16(vin3, vw); + // s16 * s16 = s32 + vout0 = lasx_madd_i16(vsum0, vone_l); + vout1 = lasx_madd_i16(vsum1, vone_l); + vout2 = lasx_madd_i16(vsum2, vone_l); + vout3 = lasx_madd_i16(vsum3, vone_l); + // a0a2b0b2a4a6b4b6 + __m256i vres0 = lasx_hadd_i32(vout0, vout1); + // c0c2d0d2c4c6d4d6 + __m256i vres1 = lasx_hadd_i32(vout2, vout3); + // a0b0c0d0a4b4c4d4 + __m256i vres = lasx_hadd_i32(vres0, vres1); + store_data_dtype_8(dout_ptr, vres, vscale_l, vbias_l); + dout_ptr += 8; + pre_din_ptr += 128; + } + for (int w = 0; w < rem_cnt; w++) { + __m256i vin0 = + lasx_loadu_m256i(reinterpret_cast<__m256i const*>(pre_din_ptr)); + __m256i vout0 = lasx_set1_i32(0); + // u8 * s8 = s16 + __m256i vsum0 = lasx_maddubs_i16(vin0, vw); + // s16 * s16 = s32 + vout0 = lasx_madd_i16(vsum0, vone_l); + // a0a2b0b2a4a6b4b6 + __m256i vres0 = lasx_hadd_i32(vout0, vout0); + // a0b0c0d0a4b4c4d4 + __m256i vres = lasx_hadd_i32(vres0, vres0); + store_data_dtype_2(dout_ptr, vres, vscale_l, vbias_l); + dout_ptr += 2; + pre_din_ptr += 32; + } + if (rem_rem > 0) { + __m128i vin0 = + lsx_loadu_m128i(reinterpret_cast<__m128i const*>(pre_din_ptr)); + __m128i vout0 = lsx_set1_i32(0); + // u8 * s8 = s16 + __m128i vsum0 = lsx_maddubs_i16(vin0, vw_temp); + // s16 * s16 = s32 + vout0 = lsx_madd_i16(vsum0, vone); + // a0a2b0b2 + __m128i vres0 = lsx_hadd_i32(vout0, vout0); + // a0b0c0d0 + __m128i vres = lsx_hadd_i32(vres0, vres0); + store_data_dtype_1(dout_ptr, vres, vscale, vbias); + } + pre_din_ptr0 += win_round; + dout_batch += wout; + } + } + // end = clock(); + // LOG(INFO) << "compute duration: " << (end-start) * 1000.0 /CLOCKS_PER_SEC; + TargetFree(TARGET(kLoongArch), pre_din); +} +template void conv_3x3s1_dw_int8(float* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); +template void conv_3x3s1_dw_int8(int8_t* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); +#undef MID_PROCESS_PAD_2 +#undef TOP_MID_PAD_2 +#undef BOT_MID_PAD_2 +#undef MID_PROCESS_PAD_1 +#undef TOP_MID_PAD_1 +#undef BOT_MID_PAD_1 +#undef MID_PROCESS +#undef LEFT_PROCESS_MORE +#undef LEFT_PROCESS +#undef RIGHT_PROCESS +#undef 
DATA_PACK +#undef ROUNDUP +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle + +#endif // __loongarch_asx diff --git a/lite/backends/loongarch/math/conv3x3s2_dephtwise_int8.cc b/lite/backends/loongarch/math/conv3x3s2_dephtwise_int8.cc new file mode 100644 index 00000000000..86f3f072751 --- /dev/null +++ b/lite/backends/loongarch/math/conv3x3s2_dephtwise_int8.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "lite/backends/loongarch/math/conv_depthwise_int8.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +void conv_3x3s2p0_dw_int8(Dtype* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx) {} + +template +void conv_3x3s2p1_dw_int8(Dtype* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx) {} + +template void conv_3x3s2p0_dw_int8(float* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); + +template void conv_3x3s2p0_dw_int8(int8_t* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); + +template void conv_3x3s2p1_dw_int8(float* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); + +template void conv_3x3s2p1_dw_int8(int8_t* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv_bias.cc b/lite/backends/loongarch/math/conv_bias.cc new file mode 100644 index 00000000000..6254c699c6d --- /dev/null +++ b/lite/backends/loongarch/math/conv_bias.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/conv_bias.h" +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void bias_add_broadcast(const float* dinx, + const float* diny, + float* dout, + int batch, + int channels, + int num) { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const float* din_ptr = dinx + offset; + const float diny_data = diny[j]; + float* dout_ptr = dout + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } +} + +void bias_add_relu_broadcast(const float* dinx, + const float* diny, + float* dout, + int batch, + int channels, + int num) { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const float* din_ptr = dinx + offset; + const float diny_data = diny[j]; + float* dout_ptr = dout + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = (std::max)(0.f, *din_ptr + diny_data); + dout_ptr++; + din_ptr++; + } + } + } +} + +void bias_add_relu6_broadcast(const float* dinx, + const float* diny, + float* dout, + int batch, + int channels, + int num) { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const float* din_ptr = dinx + offset; + const float diny_data = diny[j]; + float* dout_ptr = dout + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = (std::min)(6.f, (std::max)(0.f, *din_ptr + diny_data)); + dout_ptr++; + din_ptr++; + } + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv_bias.h b/lite/backends/loongarch/math/conv_bias.h new file mode 100644 index 00000000000..423f3ce1b79 --- /dev/null +++ b/lite/backends/loongarch/math/conv_bias.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
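+// The three broadcast helpers declared below (and implemented in
+// conv_bias.cc) add a per-channel bias to an output laid out as
+// [batch, channels, num], optionally fusing ReLU or ReLU6.
+// A minimal usage sketch with made-up sizes:
+//
+//   float in[6] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
+//   float bias[2] = {0.5f, -4.f};
+//   float out[6];
+//   bias_add_relu_broadcast(in, bias, out, /*batch=*/1, /*channels=*/2,
+//                           /*num=*/3);
+//   // out == {0.5f, 1.5f, 2.5f, 0.f, 0.f, 1.f}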
+ +#pragma once + +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void bias_add_broadcast(const float* dinx, + const float* diny, + float* dout, + int batch, + int channels, + int num); + +void bias_add_relu_broadcast(const float* dinx, + const float* diny, + float* dout, + int batch, + int channels, + int num); + +void bias_add_relu6_broadcast(const float* dinx, + const float* diny, + float* dout, + int batch, + int channels, + int num); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv_depthwise_3x3.cc b/lite/backends/loongarch/math/conv_depthwise_3x3.cc new file mode 100644 index 00000000000..61842c04ad1 --- /dev/null +++ b/lite/backends/loongarch/math/conv_depthwise_3x3.cc @@ -0,0 +1,1463 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/include/mathfuns.h" +#include "lite/backends/loongarch/math/lsx/conv_depthwise_pack4.h" +#include "lite/backends/loongarch/math/lasx/conv_depthwise_pack8.h" +#include "lite/backends/loongarch/math/common/conv_utils.h" +#include "lite/backends/loongarch/math/conv_depthwise_impl.h" +#include "lite/core/memory.h" +#include "lite/backends/loongarch/xxl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +#define Max(a, b) (a > b ? a : b) + +void conv_depthwise_3x3s2_p01_direct( + const float *din, + float *dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float *weights, + const float *bias, + int pad, + bool flag_bias, + const operators::ActivationParam act_param) { +#ifdef __loongarch_asx + + bool right = false; // for right result + + bool has_active = act_param.has_active; + auto act_type = act_param.active_type; + + float *zero_ptr = static_cast( + TargetMalloc(TARGET(kLoongArch), Max(w_in * sizeof(float), 8 * sizeof(float)))); + memset(zero_ptr, 0, Max(w_in * sizeof(float), 8 * sizeof(float))); + float *write_ptr = + static_cast(TargetMalloc(TARGET(kLoongArch), w_out * sizeof(float))); + + //! 
prepare for processing right result + int rmask_o[4] = {0}; + float rmaskr[8] = {-1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f}; + int ro = w_out % 3; + int col = w_out / 3; + if (ro > 0) col++; + if (ro > 0) { + for (int i = 0; i < 4; i++) { + if (i < ro) { + rmask_o[i] = 0x80000000; + } + } + right = true; + } + int ri = (w_in - (1 - pad)) % 6; + // [pad == 0 && w_out == 3 && win == 8] ===>>> [ri == 1 && ro == 0] + // add condition ro > 0 for avoiding wrong rmaskr when pad == 0 + if (ri > 0 && (ro > 0 || pad == 1)) { + for (int i = 0; i < 8; i++) { + if (i <= ri) { + rmaskr[i] = -1.f; + } else { + rmaskr[i] = 1.f; + } + } + } + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + __m128 zero = lsx_set1_f32(0.f); + __m256 zero_256 = lasx_set1_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; + + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + __m256 v_bias = lasx_set1_f32(bias_val); + const float *wei_ptr = weights + c * w_stride; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + + // for shift input + __m256i shift_0 = lasx_set_i32(7, 7, 6, 5, 4, 3, 2, 1); + __m256i shift_1 = lasx_set_i32(7, 7, 7, 6, 5, 4, 3, 2); + __m256i shift_3 = lasx_set_i32(6, 5, 4, 3, 2, 1, 0, 7); + + for (int i = 0; i + (1 - pad) < h_in; i += 4) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + + //! process top pad + if (i == 0 && pad == 1) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + dr0 = dr3; + dr1 = dr0 + w_in; + } else { + dr0 = dr4; + dr1 = dr0 + w_in; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i + 4 + (1 - pad) > h_in) { + switch (i + 4 + (1 - pad) - h_in) { + case 4: + din_ptr1 = zero_ptr; + case 3: + din_ptr2 = zero_ptr; + case 2: + din_ptr3 = zero_ptr; + case 1: + din_ptr4 = zero_ptr; + default: + break; + } + } + + //! process bottom remain + if (i / 2 + 2 > h_out) { + switch (i / 2 + 2 - h_out) { + case 2: + doutr0 = write_ptr; + case 1: + doutr1 = write_ptr; + default: + break; + } + } + + for (int j = 0; j < col; j += 1) { + __m256 i0 = lasx_loadu_f32(din_ptr0); + __m256 i2 = lasx_loadu_f32(din_ptr2); + __m256 i1 = lasx_loadu_f32(din_ptr1); + __m256 i3 = lasx_loadu_f32(din_ptr3); + __m256 i4 = lasx_loadu_f32(din_ptr4); + + //! 
process left pad + if (j == 0 && pad == 1) { + din_ptr0 += 5; + din_ptr1 += 5; + din_ptr2 += 5; + din_ptr3 += 5; + din_ptr4 += 5; + i0 = lasx_blend_f32(zero_256, i0, 0b01111111); + i0 = lasx_permutevar8x32_f32(i0, shift_3); + i1 = lasx_blend_f32(zero_256, i1, 0b01111111); + i1 = lasx_permutevar8x32_f32(i1, shift_3); + i2 = lasx_blend_f32(zero_256, i2, 0b01111111); + i2 = lasx_permutevar8x32_f32(i2, shift_3); + i3 = lasx_blend_f32(zero_256, i3, 0b01111111); + i3 = lasx_permutevar8x32_f32(i3, shift_3); + i4 = lasx_blend_f32(zero_256, i4, 0b01111111); + i4 = lasx_permutevar8x32_f32(i4, shift_3); + } else { + din_ptr0 += 6; + din_ptr1 += 6; + din_ptr2 += 6; + din_ptr3 += 6; + din_ptr4 += 6; + } + + //! process right remain + __m128i mask = lsx_setr_i32(0x80000000, 0x80000000, 0x80000000, 0); + if (j + 1 == col) { + __m256 rmask_ri = lasx_loadu_f32(rmaskr); + i0 = lasx_blendv_f32(zero_256, i0, rmask_ri); + i1 = lasx_blendv_f32(zero_256, i1, rmask_ri); + i2 = lasx_blendv_f32(zero_256, i2, rmask_ri); + i3 = lasx_blendv_f32(zero_256, i3, rmask_ri); + i4 = lasx_blendv_f32(zero_256, i4, rmask_ri); + dout_ptr = dout_ptr + 2 * w_out; + if (right) { + mask = lsx_setr_i32( + rmask_o[0], rmask_o[1], rmask_o[2], rmask_o[3]); + } + } + + __m256 wei_00 = lasx_set1_f32(*(wei_ptr)); + __m256 wei_01 = lasx_set1_f32(*(wei_ptr + 1)); + __m256 wei_02 = lasx_set1_f32(*(wei_ptr + 2)); + + // r0 row0 + __m256 res0 = lasx_fmadd_f32(i0, wei_00, v_bias); + __m256 tmp = lasx_permutevar8x32_f32(i0, shift_0); + res0 = lasx_fmadd_f32(tmp, wei_01, res0); + tmp = lasx_permutevar8x32_f32(i0, shift_1); + res0 = lasx_fmadd_f32(tmp, wei_02, res0); + + // r1 row0 + __m256 res1 = lasx_fmadd_f32(i2, wei_00, v_bias); + tmp = lasx_permutevar8x32_f32(i2, shift_0); + res1 = lasx_fmadd_f32(tmp, wei_01, res1); + tmp = lasx_permutevar8x32_f32(i2, shift_1); + res1 = lasx_fmadd_f32(tmp, wei_02, res1); + + __m256 wei_10 = lasx_set1_f32(*(wei_ptr + 3)); + __m256 wei_11 = lasx_set1_f32(*(wei_ptr + 4)); + __m256 wei_12 = lasx_set1_f32(*(wei_ptr + 5)); + + // r0 row0 + row1 + res0 = lasx_fmadd_f32(i1, wei_10, res0); + tmp = lasx_permutevar8x32_f32(i1, shift_0); + res0 = lasx_fmadd_f32(tmp, wei_11, res0); + tmp = lasx_permutevar8x32_f32(i1, shift_1); + res0 = lasx_fmadd_f32(tmp, wei_12, res0); + + // r1 row0 + row1 + res1 = lasx_fmadd_f32(i3, wei_10, res1); + tmp = lasx_permutevar8x32_f32(i3, shift_0); + res1 = lasx_fmadd_f32(tmp, wei_11, res1); + tmp = lasx_permutevar8x32_f32(i3, shift_1); + res1 = lasx_fmadd_f32(tmp, wei_12, res1); + + __m256 wei_20 = lasx_set1_f32(*(wei_ptr + 6)); + __m256 wei_21 = lasx_set1_f32(*(wei_ptr + 7)); + __m256 wei_22 = lasx_set1_f32(*(wei_ptr + 8)); + + // r0 row0 + row1 + row2 + res0 = lasx_fmadd_f32(i2, wei_20, res0); + tmp = lasx_permutevar8x32_f32(i2, shift_0); + res0 = lasx_fmadd_f32(tmp, wei_21, res0); + tmp = lasx_permutevar8x32_f32(i2, shift_1); + res0 = lasx_fmadd_f32(tmp, wei_22, res0); + + // r1 row0 + row1 + row2 + res1 = lasx_fmadd_f32(i4, wei_20, res1); + tmp = lasx_permutevar8x32_f32(i4, shift_0); + res1 = lasx_fmadd_f32(tmp, wei_21, res1); + tmp = lasx_permutevar8x32_f32(i4, shift_1); + res1 = lasx_fmadd_f32(tmp, wei_22, res1); + + __m256i shift_2 = lasx_set_i32(6, 4, 2, 0, 6, 4, 2, 0); + __m256 r0 = lasx_permutevar8x32_f32(res0, shift_2); + __m128 r0_128 = lasx_extractf128_f32(r0, 0); + + __m256 r1 = lasx_permutevar8x32_f32(res1, shift_2); + __m128 r1_128 = lasx_extractf128_f32(r1, 0); + + if (has_active) { // process activation + if (act_type == lite_api::ActivationType::kRelu) { + r0_128 = 
lsx_max_f32(r0_128, zero); + r1_128 = lsx_max_f32(r1_128, zero); + } else if (act_type == lite_api::ActivationType::kRelu6) { + __m128 six = lsx_set1_f32(6.f); + r0_128 = lsx_min_f32(lsx_max_f32(r0_128, zero), six); + r1_128 = lsx_min_f32(lsx_max_f32(r1_128, zero), six); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + __m128 negative_slope = lsx_set1_f32(act_param.Leaky_relu_alpha); + r0_128 = lsx_add_f32( + lsx_and_f32(lsx_cmple_f32(zero, r0_128), r0_128), + lsx_mul_f32(lsx_and_f32(lsx_cmplt_f32(r0_128, zero), r0_128), + negative_slope)); + r1_128 = lsx_add_f32( + lsx_and_f32(lsx_cmple_f32(zero, r1_128), r1_128), + lsx_mul_f32(lsx_and_f32(lsx_cmplt_f32(r1_128, zero), r1_128), + negative_slope)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + __m128 vscale = lsx_set1_f32(1.0 / act_param.hard_swish_scale); + __m128 voffset = lsx_set1_f32(act_param.hard_swish_offset); + __m128 vthreshold = lsx_set1_f32(act_param.hard_swish_threshold); + r0_128 = lsx_mul_f32( + lsx_min_f32(vthreshold, + lsx_max_f32(zero, lsx_add_f32(r0_128, voffset))), + lsx_mul_f32(r0_128, vscale)); + r1_128 = lsx_mul_f32( + lsx_min_f32(vthreshold, + lsx_max_f32(zero, lsx_add_f32(r1_128, voffset))), + lsx_mul_f32(r1_128, vscale)); + } else { + LOG(FATAL) << "[LoongArch] activation type: " + << static_cast(act_type) << "not supported"; + } + } + lsx_maskstore_f32(doutr0, mask, r0_128); + lsx_maskstore_f32(doutr1, mask, r1_128); + + doutr0 = doutr0 + 3; + doutr1 = doutr1 + 3; + } + } + } + } + TargetFree(TARGET(kLoongArch), zero_ptr); + TargetFree(TARGET(kLoongArch), write_ptr); +#else + bool right = false; // for right result + + bool has_active = act_param.has_active; + auto act_type = act_param.active_type; + + float *zero_ptr = static_cast(TargetMalloc( + TARGET(kLoongArch), Max(w_in * sizeof(float), 12 * sizeof(float)))); + memset(zero_ptr, 0, Max(w_in * sizeof(float), 12 * sizeof(float))); + float *write_ptr = + static_cast(TargetMalloc(TARGET(kLoongArch), w_out * sizeof(float))); + + //! prepare for processing right result + float rmasko[4] = {1.f, 1.f, 1.f, 1.f}; + float rmaskr[12] = { + -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f}; + int ro = w_out % 4; + int col = w_out / 4; + if (ro > 0) col++; + if (ro > 0) { + for (int i = 0; i < 4; i++) { + if (i < ro) { + rmasko[i] = -1.f; + } + } + right = true; + } + int ri = (w_in - (1 - pad)) % 8; + if (ri > 0 && (ro > 0 || pad == 1)) { + for (int i = 0; i < 12; i++) { + if (i <= ri) { + rmaskr[i] = -1.f; + } else { + rmaskr[i] = 1.f; + } + } + } + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + __m128 zero = lsx_set1_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; + + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + __m128 v_bias = lsx_set1_f32(bias_val); + const float *wei_ptr = weights + c * w_stride; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + + float *doutr0 = dout_ptr; + float *doutr0_ptr = doutr0; + + for (int i = 0; i + (1 - pad) < h_in; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + + doutr0_ptr = doutr0; + + //! 
process top pad + if (i == 0 && pad == 1) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + } else { + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } + + //! process bottom pad + if (i + 2 + (1 - pad) > h_in) { + switch (i + 2 + (1 - pad) - h_in) { + case 2: + din_ptr1 = zero_ptr; + case 1: + din_ptr2 = zero_ptr; + default: + break; + } + } + + if (i / 2 + 1 > h_out) { + doutr0_ptr = write_ptr; + } + + for (int j = 0; j < col; j++) { + __m128 i0_0 = lsx_loadu_f32(din_ptr0); + __m128 i0_1 = lsx_loadu_f32(din_ptr0 + 4); + __m128 i0_2 = lsx_loadu_f32(din_ptr0 + 8); + + __m128 i1_0 = lsx_loadu_f32(din_ptr1); + __m128 i1_1 = lsx_loadu_f32(din_ptr1 + 4); + __m128 i1_2 = lsx_loadu_f32(din_ptr1 + 8); + + __m128 i2_0 = lsx_loadu_f32(din_ptr2); + __m128 i2_1 = lsx_loadu_f32(din_ptr2 + 4); + __m128 i2_2 = lsx_loadu_f32(din_ptr2 + 8); + + //! process left pad + if (j == 0 && pad == 1) { + __m128 tmp0 = lsx_blend_f32(zero, i0_0, 0b0111); + tmp0 = lsx_shuffle_f32(tmp0, tmp0, 0b10010011); + __m128 tmp1 = lsx_blend_f32(i0_0, i0_1, 0b0111); + tmp1 = lsx_shuffle_f32(tmp1, tmp1, 0b10010011); + i0_2 = lsx_blend_f32(i0_1, i0_2, 0b0111); + i0_2 = lsx_shuffle_f32(i0_2, i0_2, 0b10010011); + i0_0 = tmp0; + i0_1 = tmp1; + + tmp0 = lsx_blend_f32(zero, i1_0, 0b0111); + tmp0 = lsx_shuffle_f32(tmp0, tmp0, 0b10010011); + tmp1 = lsx_blend_f32(i1_0, i1_1, 0b0111); + tmp1 = lsx_shuffle_f32(tmp1, tmp1, 0b10010011); + i1_2 = lsx_blend_f32(i1_1, i1_2, 0b0111); + i1_2 = lsx_shuffle_f32(i1_2, i1_2, 0b10010011); + i1_0 = tmp0; + i1_1 = tmp1; + + tmp0 = lsx_blend_f32(zero, i2_0, 0b0111); + tmp0 = lsx_shuffle_f32(tmp0, tmp0, 0b10010011); + tmp1 = lsx_blend_f32(i2_0, i2_1, 0b0111); + tmp1 = lsx_shuffle_f32(tmp1, tmp1, 0b10010011); + i2_2 = lsx_blend_f32(i2_1, i2_2, 0b0111); + i2_2 = lsx_shuffle_f32(i2_2, i2_2, 0b10010011); + i2_0 = tmp0; + i2_1 = tmp1; + + din_ptr0 += 7; + din_ptr1 += 7; + din_ptr2 += 7; + } else { + din_ptr0 += 8; + din_ptr1 += 8; + din_ptr2 += 8; + } + + //! process right remain + if (j + 1 == col) { + doutr0 = doutr0 + w_out; + __m128 rmask = lsx_loadu_f32(rmaskr); + i0_0 = lsx_blendv_f32(zero, i0_0, rmask); + i1_0 = lsx_blendv_f32(zero, i1_0, rmask); + i2_0 = lsx_blendv_f32(zero, i2_0, rmask); + + rmask = lsx_loadu_f32(rmaskr + 4); + i0_1 = lsx_blendv_f32(zero, i0_1, rmask); + i1_1 = lsx_blendv_f32(zero, i1_1, rmask); + i2_1 = lsx_blendv_f32(zero, i2_1, rmask); + + rmask = lsx_loadu_f32(rmaskr + 8); + i0_2 = lsx_blendv_f32(zero, i0_2, rmask); + i1_2 = lsx_blendv_f32(zero, i1_2, rmask); + i2_2 = lsx_blendv_f32(zero, i2_2, rmask); + } + //! 
shift input + // 0,1,2,3 4,5,6,7 8,9,10,11 => 0,1,2,3 2,3,4,5 3,4,5,6 + __m128 tmp = lsx_shuffle_f32(i0_0, i0_1, 0b10001000); + i0_1 = lsx_shuffle_f32(i0_0, i0_1, 0b11011101); + i0_0 = tmp; + i0_2 = lsx_blend_f32(i0_2, i0_0, 0b1110); + i0_2 = lsx_shuffle_f32(i0_2, i0_2, 0b00111001); + + tmp = lsx_shuffle_f32(i1_0, i1_1, 0b10001000); + i1_1 = lsx_shuffle_f32(i1_0, i1_1, 0b11011101); + i1_0 = tmp; + i1_2 = lsx_blend_f32(i1_2, i1_0, 0b1110); + i1_2 = lsx_shuffle_f32(i1_2, i1_2, 0b00111001); + + tmp = lsx_shuffle_f32(i2_0, i2_1, 0b10001000); + i2_1 = lsx_shuffle_f32(i2_0, i2_1, 0b11011101); + i2_0 = tmp; + i2_2 = lsx_blend_f32(i2_2, i2_0, 0b1110); + i2_2 = lsx_shuffle_f32(i2_2, i2_2, 0b00111001); + + __m128 wei_00 = lsx_load1_f32(wei_ptr); + __m128 wei_01 = lsx_load1_f32(wei_ptr + 1); + __m128 wei_02 = lsx_load1_f32(wei_ptr + 2); + + // r0 row0 + __m128 r0 = lsx_mul_f32(i0_0, wei_00); + r0 = lsx_add_f32(r0, v_bias); + tmp = lsx_mul_f32(i0_1, wei_01); + r0 = lsx_add_f32(r0, tmp); + tmp = lsx_mul_f32(i0_2, wei_02); + r0 = lsx_add_f32(r0, tmp); + + __m128 wei_10 = lsx_load1_f32(wei_ptr + 3); + __m128 wei_11 = lsx_load1_f32(wei_ptr + 4); + __m128 wei_12 = lsx_load1_f32(wei_ptr + 5); + + // r0 row0 + row1 + tmp = lsx_mul_f32(i1_0, wei_10); + r0 = lsx_add_f32(r0, tmp); + tmp = lsx_mul_f32(i1_1, wei_11); + r0 = lsx_add_f32(r0, tmp); + tmp = lsx_mul_f32(i1_2, wei_12); + r0 = lsx_add_f32(r0, tmp); + + __m128 wei_20 = lsx_load1_f32(wei_ptr + 6); + __m128 wei_21 = lsx_load1_f32(wei_ptr + 7); + __m128 wei_22 = lsx_load1_f32(wei_ptr + 8); + + // r0 row0 + row1 + row2 + tmp = lsx_mul_f32(i2_0, wei_20); + r0 = lsx_add_f32(r0, tmp); + tmp = lsx_mul_f32(i2_1, wei_21); + r0 = lsx_add_f32(r0, tmp); + tmp = lsx_mul_f32(i2_2, wei_22); + r0 = lsx_add_f32(r0, tmp); + + if (has_active) { // process activation + if (act_type == lite_api::ActivationType::kRelu) { + r0 = lsx_max_f32(r0, zero); + } else if (act_type == lite_api::ActivationType::kRelu6) { + __m128 six = lsx_set1_f32(6.f); + r0 = lsx_min_f32(lsx_max_f32(r0, zero), six); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + __m128 negative_slope = lsx_set1_f32(act_param.Leaky_relu_alpha); + r0 = lsx_add_f32(lsx_and_f32(lsx_cmple_f32(zero, r0), r0), + lsx_mul_f32(lsx_and_f32(lsx_cmplt_f32(r0, zero), r0), + negative_slope)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + r0 = lsx_mul_f32( + lsx_min_f32( + lsx_set1_f32(act_param.hard_swish_threshold), + lsx_max_f32( + zero, + lsx_add_f32( + r0, lsx_set1_f32(act_param.hard_swish_offset)))), + lsx_mul_f32(r0, + lsx_set1_f32(1.0 / act_param.hard_swish_scale))); + } else { + LOG(FATAL) << "[LoongArch] activation type: " + << static_cast(act_type) << "not supported"; + } + } + + //! 
process bottom pad + if (j + 1 == col && right) { + __m128 out0 = lsx_loadu_f32(doutr0_ptr); + __m128 rmask_ro = lsx_loadu_f32(rmasko); + r0 = lsx_blendv_f32(out0, r0, rmask_ro); + } + + lsx_storeu_f32(doutr0_ptr, r0); + + doutr0_ptr += 4; + } + } + } + } + TargetFree(TARGET(kLoongArch), zero_ptr); + TargetFree(TARGET(kLoongArch), write_ptr); +#endif +} +void conv_depthwise_3x3s1_p01_direct( + const float *din, + float *dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float *weights, + const float *bias, + int pad, + bool flag_bias, + const operators::ActivationParam act_param) { +#ifdef __loongarch_asx + bool right = false; + + bool has_active = act_param.has_active; + auto act_type = act_param.active_type; + + float *zero_ptr = static_cast( + TargetMalloc(TARGET(kLoongArch), Max(w_in * sizeof(float), 8))); + memset(zero_ptr, 0, Max(w_in * sizeof(float), 8)); + float *write_ptr = + static_cast(TargetMalloc(TARGET(kLoongArch), w_out * sizeof(float))); + + //! prepare for processing right result + int rmask_o[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + float rmaskr[8] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}; + int r = w_out % 6; + int col = w_out / 6; + if (r > 0) col++; + if (r > 0) { + for (int i = 0; i < 8; i++) { + if (i < r) { + rmask_o[i] = 0x80000000; + } + if (i <= r + (1 - pad)) { + rmaskr[i] = -1.f; + } + } + right = true; + } else { + for (int i = 0; i < 7 + (1 - pad); i++) { + rmaskr[i] = -1.f; + } + } + + __m256i shift_1 = lasx_set_i32(7, 7, 6, 5, 4, 3, 2, 1); + __m256i shift_2 = lasx_set_i32(7, 7, 7, 6, 5, 4, 3, 2); + __m256i shift_3 = lasx_set_i32(6, 5, 4, 3, 2, 1, 0, 7); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + __m256 zero = lasx_set1_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; + + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + __m256 v_bias = lasx_set1_f32(bias_val); + const float *wei_ptr = weights + c * w_stride; + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + + for (int i = 0; i < h_out; i += 4) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + + //! process top pad + if (i == 0 && pad == 1) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + din_ptr5 = dr4; + dr0 = dr3; + dr1 = dr4; + dr2 = dr5; + } else { + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + } + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! 
process bottom pad + if (i + 5 + (1 - pad) > h_in) { + switch (i + 5 + (1 - pad) - h_in) { + case 5: + din_ptr1 = zero_ptr; + case 4: + din_ptr2 = zero_ptr; + case 3: + din_ptr3 = zero_ptr; + case 2: + din_ptr4 = zero_ptr; + case 1: + din_ptr5 = zero_ptr; + default: + break; + } + } + + //! process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + for (int j = 0; j < col; j += 1) { + __m256 i0 = lasx_loadu_f32(din_ptr0); + __m256 i1 = lasx_loadu_f32(din_ptr1); + __m256 i2 = lasx_loadu_f32(din_ptr2); + __m256 i3 = lasx_loadu_f32(din_ptr3); + __m256 i4 = lasx_loadu_f32(din_ptr4); + __m256 i5 = lasx_loadu_f32(din_ptr5); + + //! process left pad + if (j == 0 && pad == 1) { + din_ptr0 += 5; + din_ptr1 += 5; + din_ptr2 += 5; + din_ptr3 += 5; + din_ptr4 += 5; + din_ptr5 += 5; + i0 = lasx_blend_f32(zero, i0, 0b01111111); + i0 = lasx_permutevar8x32_f32(i0, shift_3); + i1 = lasx_blend_f32(zero, i1, 0b01111111); + i1 = lasx_permutevar8x32_f32(i1, shift_3); + i2 = lasx_blend_f32(zero, i2, 0b01111111); + i2 = lasx_permutevar8x32_f32(i2, shift_3); + i3 = lasx_blend_f32(zero, i3, 0b01111111); + i3 = lasx_permutevar8x32_f32(i3, shift_3); + i4 = lasx_blend_f32(zero, i4, 0b01111111); + i4 = lasx_permutevar8x32_f32(i4, shift_3); + i5 = lasx_blend_f32(zero, i5, 0b01111111); + i5 = lasx_permutevar8x32_f32(i5, shift_3); + } else { + din_ptr0 += 6; + din_ptr1 += 6; + din_ptr2 += 6; + din_ptr3 += 6; + din_ptr4 += 6; + din_ptr5 += 6; + } + + //! process right remain + __m256i smask_ = lasx_set_i32(0, + 0, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000); + if (j + 1 == col) { + __m256 rmask_i = lasx_loadu_f32(rmaskr); + i0 = lasx_blendv_f32(zero, i0, rmask_i); + i1 = lasx_blendv_f32(zero, i1, rmask_i); + i2 = lasx_blendv_f32(zero, i2, rmask_i); + i3 = lasx_blendv_f32(zero, i3, rmask_i); + i4 = lasx_blendv_f32(zero, i4, rmask_i); + i5 = lasx_blendv_f32(zero, i5, rmask_i); + dout_ptr = dout_ptr + 4 * w_out; + if (right) { + smask_ = lasx_set_i32(rmask_o[7], + rmask_o[6], + rmask_o[5], + rmask_o[4], + rmask_o[3], + rmask_o[2], + rmask_o[1], + rmask_o[0]); + } + } + + __m256 wei_00 = lasx_set1_f32(*(wei_ptr)); + __m256 wei_01 = lasx_set1_f32(*(wei_ptr + 1)); + __m256 wei_02 = lasx_set1_f32(*(wei_ptr + 2)); + + // r0 row0 + __m256 r0 = lasx_fmadd_f32(i0, wei_00, v_bias); + __m256 tmp = lasx_permutevar8x32_f32(i0, shift_1); + r0 = lasx_fmadd_f32(tmp, wei_01, r0); + tmp = lasx_permutevar8x32_f32(i0, shift_2); + r0 = lasx_fmadd_f32(tmp, wei_02, r0); + + // r1 row0 + __m256 r1 = lasx_fmadd_f32(i1, wei_00, v_bias); + tmp = lasx_permutevar8x32_f32(i1, shift_1); + r1 = lasx_fmadd_f32(tmp, wei_01, r1); + tmp = lasx_permutevar8x32_f32(i1, shift_2); + r1 = lasx_fmadd_f32(tmp, wei_02, r1); + + // r2 row0 + __m256 r2 = lasx_fmadd_f32(i2, wei_00, v_bias); + tmp = lasx_permutevar8x32_f32(i2, shift_1); + r2 = lasx_fmadd_f32(tmp, wei_01, r2); + tmp = lasx_permutevar8x32_f32(i2, shift_2); + r2 = lasx_fmadd_f32(tmp, wei_02, r2); + + // r3 row0 + __m256 r3 = lasx_fmadd_f32(i3, wei_00, v_bias); + tmp = lasx_permutevar8x32_f32(i3, shift_1); + r3 = lasx_fmadd_f32(tmp, wei_01, r3); + tmp = lasx_permutevar8x32_f32(i3, shift_2); + r3 = lasx_fmadd_f32(tmp, wei_02, r3); + + __m256 wei_10 = lasx_set1_f32(*(wei_ptr + 3)); + __m256 wei_11 = lasx_set1_f32(*(wei_ptr + 4)); + __m256 wei_12 = lasx_set1_f32(*(wei_ptr + 5)); + + // r0 row0 + row1 + r0 = lasx_fmadd_f32(i1, wei_10, 
r0); + tmp = lasx_permutevar8x32_f32(i1, shift_1); + r0 = lasx_fmadd_f32(tmp, wei_11, r0); + tmp = lasx_permutevar8x32_f32(i1, shift_2); + r0 = lasx_fmadd_f32(tmp, wei_12, r0); + + // r1 row0 + row1 + r1 = lasx_fmadd_f32(i2, wei_10, r1); + tmp = lasx_permutevar8x32_f32(i2, shift_1); + r1 = lasx_fmadd_f32(tmp, wei_11, r1); + tmp = lasx_permutevar8x32_f32(i2, shift_2); + r1 = lasx_fmadd_f32(tmp, wei_12, r1); + + // r2 row0 + row1 + r2 = lasx_fmadd_f32(i3, wei_10, r2); + tmp = lasx_permutevar8x32_f32(i3, shift_1); + r2 = lasx_fmadd_f32(tmp, wei_11, r2); + tmp = lasx_permutevar8x32_f32(i3, shift_2); + r2 = lasx_fmadd_f32(tmp, wei_12, r2); + + // r3 row0 + row1 + r3 = lasx_fmadd_f32(i4, wei_10, r3); + tmp = lasx_permutevar8x32_f32(i4, shift_1); + r3 = lasx_fmadd_f32(tmp, wei_11, r3); + tmp = lasx_permutevar8x32_f32(i4, shift_2); + r3 = lasx_fmadd_f32(tmp, wei_12, r3); + + __m256 wei_20 = lasx_set1_f32(*(wei_ptr + 6)); + __m256 wei_21 = lasx_set1_f32(*(wei_ptr + 7)); + __m256 wei_22 = lasx_set1_f32(*(wei_ptr + 8)); + + // r0 row0 + row1 + row2 + r0 = lasx_fmadd_f32(i2, wei_20, r0); + tmp = lasx_permutevar8x32_f32(i2, shift_1); + r0 = lasx_fmadd_f32(tmp, wei_21, r0); + tmp = lasx_permutevar8x32_f32(i2, shift_2); + r0 = lasx_fmadd_f32(tmp, wei_22, r0); + + // r1 row0 + row1 + row2 + r1 = lasx_fmadd_f32(i3, wei_20, r1); + tmp = lasx_permutevar8x32_f32(i3, shift_1); + r1 = lasx_fmadd_f32(tmp, wei_21, r1); + tmp = lasx_permutevar8x32_f32(i3, shift_2); + r1 = lasx_fmadd_f32(tmp, wei_22, r1); + + // r2 row0 + row1 + row2 + r2 = lasx_fmadd_f32(i4, wei_20, r2); + tmp = lasx_permutevar8x32_f32(i4, shift_1); + r2 = lasx_fmadd_f32(tmp, wei_21, r2); + tmp = lasx_permutevar8x32_f32(i4, shift_2); + r2 = lasx_fmadd_f32(tmp, wei_22, r2); + + // r3 row0 + row1 + row2 + r3 = lasx_fmadd_f32(i5, wei_20, r3); + tmp = lasx_permutevar8x32_f32(i5, shift_1); + r3 = lasx_fmadd_f32(tmp, wei_21, r3); + tmp = lasx_permutevar8x32_f32(i5, shift_2); + r3 = lasx_fmadd_f32(tmp, wei_22, r3); + + if (has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + r0 = lasx_max_f32(r0, zero); + r1 = lasx_max_f32(r1, zero); + r2 = lasx_max_f32(r2, zero); + r3 = lasx_max_f32(r3, zero); + } else if (act_type == lite_api::ActivationType::kRelu6) { + __m256 six = lasx_set1_f32(act_param.Relu_clipped_coef); + r0 = lasx_min_f32(lasx_max_f32(r0, zero), six); + r1 = lasx_min_f32(lasx_max_f32(r1, zero), six); + r2 = lasx_min_f32(lasx_max_f32(r2, zero), six); + r3 = lasx_min_f32(lasx_max_f32(r3, zero), six); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + __m256 negative_slope = + lasx_set1_f32(act_param.Leaky_relu_alpha); + r0 = lasx_add_f32( + lasx_and_f32(lasx_cmp_f32(zero, r0, 18), r0), + lasx_mul_f32(lasx_and_f32(lasx_cmp_f32(r0, zero, 17), r0), + negative_slope)); + r1 = lasx_add_f32( + lasx_and_f32(lasx_cmp_f32(zero, r1, 18), r1), + lasx_mul_f32(lasx_and_f32(lasx_cmp_f32(r1, zero, 17), r1), + negative_slope)); + r2 = lasx_add_f32( + lasx_and_f32(lasx_cmp_f32(zero, r2, 18), r2), + lasx_mul_f32(lasx_and_f32(lasx_cmp_f32(r2, zero, 17), r2), + negative_slope)); + r3 = lasx_add_f32( + lasx_and_f32(lasx_cmp_f32(zero, r3, 18), r3), + lasx_mul_f32(lasx_and_f32(lasx_cmp_f32(r3, zero, 17), r3), + negative_slope)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + __m256 vscale = lasx_set1_f32(1.0 / act_param.hard_swish_scale); + __m256 voffset = lasx_set1_f32(act_param.hard_swish_offset); + __m256 vthreshold = + lasx_set1_f32(act_param.hard_swish_threshold); + r0 = lasx_mul_f32( + lasx_min_f32( + 
vthreshold, + lasx_max_f32(zero, lasx_add_f32(r0, voffset))), + lasx_mul_f32(r0, vscale)); + r1 = lasx_mul_f32( + lasx_min_f32( + vthreshold, + lasx_max_f32(zero, lasx_add_f32(r1, voffset))), + lasx_mul_f32(r1, vscale)); + r2 = lasx_mul_f32( + lasx_min_f32( + vthreshold, + lasx_max_f32(zero, lasx_add_f32(r2, voffset))), + lasx_mul_f32(r2, vscale)); + r3 = lasx_mul_f32( + lasx_min_f32( + vthreshold, + lasx_max_f32(zero, lasx_add_f32(r3, voffset))), + lasx_mul_f32(r3, vscale)); + } else { + LOG(FATAL) << "[LoongArch] activation type: " + << static_cast(act_type) << "not supported"; + } + } + + lasx_maskstore_f32(doutr0, smask_, r0); + lasx_maskstore_f32(doutr1, smask_, r1); + lasx_maskstore_f32(doutr2, smask_, r2); + lasx_maskstore_f32(doutr3, smask_, r3); + + doutr0 = doutr0 + 6; + doutr1 = doutr1 + 6; + doutr2 = doutr2 + 6; + doutr3 = doutr3 + 6; + } + } + } + } + + TargetFree(TARGET(kLoongArch), zero_ptr); + TargetFree(TARGET(kLoongArch), write_ptr); +#else + bool right = false; // for right result + + bool has_active = act_param.has_active; + auto act_type = act_param.active_type; + + float *zero_ptr = static_cast( + TargetMalloc(TARGET(kLoongArch), Max(w_in * sizeof(float), 8))); + memset(zero_ptr, 0, Max(w_in * sizeof(float), 8)); + float *write_ptr = + static_cast(TargetMalloc(TARGET(kLoongArch), w_out * sizeof(float))); + + //! prepare for processing right result + float rmasko[4] = {1.f, 1.f, 1.f, 1.f}; + float rmaskr[8] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}; + int r = w_out % 4; + int col = w_out / 4; + if (r > 0) col++; + if (r > 0) { + for (int i = 0; i < 4; i++) { + if (i < r) { + rmasko[i] = -1.f; + } + } + right = true; + } + if (r > 0) { + for (int i = 0; i < 8; i++) { + if (i <= r + 1 - pad) { + rmaskr[i] = -1.f; + } + } + } else { + for (int i = 0; i < 5 + (1 - pad); i++) { + rmaskr[i] = -1.f; + } + } + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + __m128 zero = lsx_set1_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; + + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + __m128 v_bias = lsx_set1_f32(bias_val); + const float *wei_ptr = weights + c * w_stride; + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + + //! process top pad + if (i == 0 && pad == 1) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + } else { + dr0 = dr2; + } + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + + //! process bottom pad + if (i + 3 + (1 - pad) > h_in) { + switch (i + 3 + (1 - pad) - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + switch (i + 2 - h_out) { + case 1: + doutr1 = write_ptr; + default: + break; + } + } + + for (int j = 0; j < col; j += 1) { + __m128 i0_0 = lsx_loadu_f32(din_ptr0); + __m128 i0_1 = lsx_loadu_f32(din_ptr0 + 4); + __m128 i1_0 = lsx_loadu_f32(din_ptr1); + __m128 i1_1 = lsx_loadu_f32(din_ptr1 + 4); + __m128 i2_0 = lsx_loadu_f32(din_ptr2); + __m128 i2_1 = lsx_loadu_f32(din_ptr2 + 4); + __m128 i3_0 = lsx_loadu_f32(din_ptr3); + __m128 i3_1 = lsx_loadu_f32(din_ptr3 + 4); + + //! process left pad + if (j == 0 && pad == 1) { + __m128 tmp0 = lsx_blend_f32(zero, i0_0, 0b0111); + tmp0 = lsx_shuffle_f32(tmp0, tmp0, 0b10010011); + i0_1 = lsx_blend_f32(i0_0, i0_1, 0b0111); + i0_1 = lsx_shuffle_f32(i0_1, i0_1, 0b10010011); + i0_0 = tmp0; + + tmp0 = lsx_blend_f32(zero, i1_0, 0b0111); + tmp0 = lsx_shuffle_f32(tmp0, tmp0, 0b10010011); + i1_1 = lsx_blend_f32(i1_0, i1_1, 0b0111); + i1_1 = lsx_shuffle_f32(i1_1, i1_1, 0b10010011); + i1_0 = tmp0; + + tmp0 = lsx_blend_f32(zero, i2_0, 0b0111); + tmp0 = lsx_shuffle_f32(tmp0, tmp0, 0b10010011); + i2_1 = lsx_blend_f32(i2_0, i2_1, 0b0111); + i2_1 = lsx_shuffle_f32(i2_1, i2_1, 0b10010011); + i2_0 = tmp0; + + tmp0 = lsx_blend_f32(zero, i3_0, 0b0111); + tmp0 = lsx_shuffle_f32(tmp0, tmp0, 0b10010011); + i3_1 = lsx_blend_f32(i3_0, i3_1, 0b0111); + i3_1 = lsx_shuffle_f32(i3_1, i3_1, 0b10010011); + i3_0 = tmp0; + + din_ptr0 += 3; + din_ptr1 += 3; + din_ptr2 += 3; + din_ptr3 += 3; + } else { + din_ptr0 += 4; + din_ptr1 += 4; + din_ptr2 += 4; + din_ptr3 += 4; + } + + //! process right remain + if (j + 1 == col) { + dout_ptr = dout_ptr + 2 * w_out; + __m128 rmask_i = lsx_loadu_f32(rmaskr); + i0_0 = lsx_blendv_f32(zero, i0_0, rmask_i); + i1_0 = lsx_blendv_f32(zero, i1_0, rmask_i); + i2_0 = lsx_blendv_f32(zero, i2_0, rmask_i); + i3_0 = lsx_blendv_f32(zero, i3_0, rmask_i); + + rmask_i = lsx_loadu_f32(rmaskr + 4); + i0_1 = lsx_blendv_f32(zero, i0_1, rmask_i); + i1_1 = lsx_blendv_f32(zero, i1_1, rmask_i); + i2_1 = lsx_blendv_f32(zero, i2_1, rmask_i); + i3_1 = lsx_blendv_f32(zero, i3_1, rmask_i); + } + + __m128 wei_00 = lsx_load1_f32(wei_ptr); + __m128 wei_01 = lsx_load1_f32(wei_ptr + 1); + __m128 wei_02 = lsx_load1_f32(wei_ptr + 2); + + // r0 row0 + __m128 r0 = lsx_mul_f32(i0_0, wei_00); + r0 = lsx_add_f32(r0, v_bias); + __m128 tmp = lsx_blend_f32(i0_0, i0_1, 0b0001); + tmp = lsx_shuffle_f32(tmp, tmp, 0b00111001); + tmp = lsx_mul_f32(tmp, wei_01); + r0 = lsx_add_f32(tmp, r0); + tmp = lsx_blend_f32(i0_0, i0_1, 0b0011); + tmp = lsx_shuffle_f32(tmp, tmp, 0b01001110); + tmp = lsx_mul_f32(tmp, wei_02); + r0 = lsx_add_f32(tmp, r0); + + // r1 row0 + __m128 r1 = lsx_mul_f32(i1_0, wei_00); + r1 = lsx_add_f32(r1, v_bias); + tmp = lsx_blend_f32(i1_0, i1_1, 0b0001); + tmp = lsx_shuffle_f32(tmp, tmp, 0b00111001); + tmp = lsx_mul_f32(tmp, wei_01); + r1 = lsx_add_f32(tmp, r1); + tmp = lsx_blend_f32(i1_0, i1_1, 0b0011); + tmp = lsx_shuffle_f32(tmp, tmp, 0b01001110); + tmp = lsx_mul_f32(tmp, wei_02); + r1 = lsx_add_f32(tmp, r1); + + __m128 wei_10 = lsx_load1_f32(wei_ptr + 3); + __m128 wei_11 = lsx_load1_f32(wei_ptr + 4); + __m128 wei_12 = lsx_load1_f32(wei_ptr + 5); + + // r0 row0 + row1 + tmp = lsx_mul_f32(i1_0, wei_10); + r0 = lsx_add_f32(r0, tmp); + tmp = lsx_blend_f32(i1_0, i1_1, 0b0001); + tmp = lsx_shuffle_f32(tmp, tmp, 0b00111001); + tmp = lsx_mul_f32(tmp, wei_11); + r0 = lsx_add_f32(tmp, r0); + tmp = lsx_blend_f32(i1_0, i1_1, 0b0011); + tmp = lsx_shuffle_f32(tmp, tmp, 0b01001110); + tmp = lsx_mul_f32(tmp, wei_12); + r0 = lsx_add_f32(tmp, r0); + + 
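+            // Note on the addressing used here: the three taps of each kernel
+            // row are built by splicing neighboring 4-float registers.
+            // lsx_blend_f32 pulls the low lanes of the next load into the
+            // current one and lsx_shuffle_f32 rotates the result, which gives
+            // the input shifted by one float (blend 0b0001, shuffle
+            // 0b00111001) and by two floats (blend 0b0011, shuffle
+            // 0b01001110) without unaligned reloads. The same splice pattern
+            // repeats for the remaining kernel rows below.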
// r1 row0 + row1 + tmp = lsx_mul_f32(i2_0, wei_10); + r1 = lsx_add_f32(r1, tmp); + tmp = lsx_blend_f32(i2_0, i2_1, 0b0001); + tmp = lsx_shuffle_f32(tmp, tmp, 0b00111001); + tmp = lsx_mul_f32(tmp, wei_11); + r1 = lsx_add_f32(tmp, r1); + tmp = lsx_blend_f32(i2_0, i2_1, 0b0011); + tmp = lsx_shuffle_f32(tmp, tmp, 0b01001110); + tmp = lsx_mul_f32(tmp, wei_12); + r1 = lsx_add_f32(tmp, r1); + + __m128 wei_20 = lsx_load1_f32(wei_ptr + 6); + __m128 wei_21 = lsx_load1_f32(wei_ptr + 7); + __m128 wei_22 = lsx_load1_f32(wei_ptr + 8); + + // r0 row0 + row1 + row2 + tmp = lsx_mul_f32(i2_0, wei_20); + r0 = lsx_add_f32(r0, tmp); + tmp = lsx_blend_f32(i2_0, i2_1, 0b0001); + tmp = lsx_shuffle_f32(tmp, tmp, 0b00111001); + tmp = lsx_mul_f32(tmp, wei_21); + r0 = lsx_add_f32(tmp, r0); + tmp = lsx_blend_f32(i2_0, i2_1, 0b0011); + tmp = lsx_shuffle_f32(tmp, tmp, 0b01001110); + tmp = lsx_mul_f32(tmp, wei_22); + r0 = lsx_add_f32(tmp, r0); + + // r1 row0 + row1 + row2 + tmp = lsx_mul_f32(i3_0, wei_20); + r1 = lsx_add_f32(r1, tmp); + tmp = lsx_blend_f32(i3_0, i3_1, 0b0001); + tmp = lsx_shuffle_f32(tmp, tmp, 0b00111001); + tmp = lsx_mul_f32(tmp, wei_21); + r1 = lsx_add_f32(tmp, r1); + tmp = lsx_blend_f32(i3_0, i3_1, 0b0011); + tmp = lsx_shuffle_f32(tmp, tmp, 0b01001110); + tmp = lsx_mul_f32(tmp, wei_22); + r1 = lsx_add_f32(tmp, r1); + + if (has_active) { // process activation + if (act_type == lite_api::ActivationType::kRelu) { + r0 = lsx_max_f32(r0, zero); + r1 = lsx_max_f32(r1, zero); + } else if (act_type == lite_api::ActivationType::kRelu6) { + __m128 six = lsx_set1_f32(act_param.Relu_clipped_coef); + r0 = lsx_min_f32(lsx_max_f32(r0, zero), six); + r1 = lsx_min_f32(lsx_max_f32(r1, zero), six); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + __m128 negative_slope = lsx_set1_f32(act_param.Leaky_relu_alpha); + r0 = lsx_add_f32(lsx_and_f32(lsx_cmple_f32(zero, r0), r0), + lsx_mul_f32(lsx_and_f32(lsx_cmplt_f32(r0, zero), r0), + negative_slope)); + r1 = lsx_add_f32(lsx_and_f32(lsx_cmple_f32(zero, r1), r1), + lsx_mul_f32(lsx_and_f32(lsx_cmplt_f32(r1, zero), r1), + negative_slope)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + __m128 vscale = lsx_set1_f32(1.0 / act_param.hard_swish_scale); + __m128 voffset = lsx_set1_f32(act_param.hard_swish_offset); + __m128 vthreshold = lsx_set1_f32(act_param.hard_swish_threshold); + r0 = lsx_mul_f32( + lsx_min_f32(vthreshold, + lsx_max_f32(zero, lsx_add_f32(r0, voffset))), + lsx_mul_f32(r0, vscale)); + r1 = lsx_mul_f32( + lsx_min_f32(vthreshold, + lsx_max_f32(zero, lsx_add_f32(r1, voffset))), + lsx_mul_f32(r1, vscale)); + } else { + LOG(FATAL) << "[LoongArch] activation type: " + << static_cast(act_type) << "not supported"; + } + } + + //! 
process bottom pad + if (j + 1 == col && right) { + __m128 out0 = lsx_loadu_f32(doutr0); + __m128 out1 = lsx_loadu_f32(doutr1); + __m128 rmask_ro = lsx_loadu_f32(rmasko); + r0 = lsx_blendv_f32(out0, r0, rmask_ro); + r1 = lsx_blendv_f32(out1, r1, rmask_ro); + } + + lsx_storeu_f32(doutr0, r0); + lsx_storeu_f32(doutr1, r1); + + doutr0 += 4; + doutr1 += 4; + } + } + } + } + TargetFree(TARGET(kLoongArch), zero_ptr); + TargetFree(TARGET(kLoongArch), write_ptr); +#endif +} + +void conv_depthwise_3x3_pack(const operators::ConvParam ¶m, + lite::Tensor *input_padding_, + lite::Tensor *input_pack_, + lite::Tensor *filter_pack_, + lite::Tensor *output_pack_) { + auto input_dims = param.x->dims(); + CHECK_EQ(input_dims.size(), 4UL); + int batch_size = param.x->dims()[0]; + int input_channel = param.x->dims()[1]; + +#if __loongarch_asx + const int pack_size = + input_channel % 8 == 0 ? 8 : input_channel % 4 == 0 ? 4 : 1; +#else + const int pack_size = input_channel % 4 == 0 ? 4 : 1; +#endif + + const int pack_num = input_channel / pack_size; + +#if __loongarch_asx + if (pack_size == 8) { + pack_padding8_m256(param.x, input_padding_, pack_num, *(param.paddings)); + } else if (pack_size == 4) { + pack4_m128(param.x, input_pack_, pack_num, false); + padding4_m128(input_pack_, input_padding_, *(param.paddings)); +#else + if (pack_size == 4) { + pack4_m128(param.x, input_pack_, pack_num, false); + padding4_m128(input_pack_, input_padding_, *(param.paddings)); +#endif + } else { + padding1_float(param.x, input_padding_, *(param.paddings)); + } + + // filter [oc, ic/groups=1, kh, kw] + auto filter_dims = param.filter->dims(); + CHECK_EQ(filter_dims.size(), 4UL); + int kernel_h = param.filter->dims()[2]; + int kernel_w = param.filter->dims()[3]; + + // filter [oc, 1, ih, iw] & pack_size=8 => [oc/8, ih, iw, 8] + // filter [oc, 1, ih, iw] & pack_size=4 => [ic/4, ih, iw, 4] +#if __loongarch_asx + if (pack_size == 8) { + pack8_m256(param.filter, filter_pack_, pack_num, true); + } else if (pack_size == 4) { + pack4_m128(param.filter, filter_pack_, pack_num, true); + } +#else + if (pack_size == 4) { + pack4_m128(param.filter, filter_pack_, pack_num, true); + } +#endif + + // attributes + const int stride_h = param.strides[0]; + const int stride_w = param.strides[1]; + const int dilation_h = (*param.dilations)[0]; + const int dilation_w = (*param.dilations)[1]; + + // act type + auto act_param = param.activation_param; + bool has_act = act_param.has_active; + auto act_type = act_param.active_type; + + // output [bs, oc, oh, ow] + CHECK_EQ(param.output->dims().size(), 4UL); + const int in_h = input_padding_->dims()[2], in_w = input_padding_->dims()[3]; + const int kernel_extend_h = dilation_h * (kernel_h - 1) + 1; + const int kernel_extend_w = dilation_w * (kernel_w - 1) + 1; + int output_height = (in_h - kernel_extend_h) / stride_h + 1; + int output_width = (in_w - kernel_extend_w) / stride_w + 1; + // output_trans [bs, oc/8, oh, ow, 8] + // output_trans [bs, oc/4, oh, ow, 4] + output_pack_->Resize( + {batch_size, pack_num, output_height, output_width, pack_size}); +#if __loongarch_asx + if (pack_size == 8) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dilation_h == 1 && dilation_w == 1) { + conv_depthwise_3x3s1_m256(input_padding_, + output_pack_, + filter_pack_, + param.bias, + has_act, + act_type, + act_param); + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && + stride_w == 2 && dilation_h == 1 && dilation_w == 1) { + conv_depthwise_3x3s2_m256(input_padding_, + 
output_pack_, + filter_pack_, + param.bias, + has_act, + act_type, + act_param); + } else { + conv_depthwise_m256(input_padding_, + output_pack_, + filter_pack_, + param.bias, + stride_h, + stride_w, + dilation_h, + dilation_w, + has_act, + act_type, + act_param); + } + } +#else + if (pack_size == 4) { + conv_depthwise_m128(input_padding_, + output_pack_, + filter_pack_, + param.bias, + stride_h, + stride_w, + dilation_h, + dilation_w, + has_act, + act_type, + act_param); + } +#endif + +#if __loongarch_asx + // [bs, oh, ow, oc] => [bs, oc, oh, ow] + if (pack_size == 8) { + unpack8_m256(output_pack_, param.output); + } +#else + if (pack_size == 4) { + unpack4_m128(output_pack_, param.output); + } +#endif +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv_depthwise_5x5.cc b/lite/backends/loongarch/math/conv_depthwise_5x5.cc new file mode 100644 index 00000000000..62369bd1775 --- /dev/null +++ b/lite/backends/loongarch/math/conv_depthwise_5x5.cc @@ -0,0 +1,807 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/conv_depthwise_5x5.h" +#include +#include "lite/backends/loongarch/math/include/mathfuns.h" +#include "lite/backends/loongarch/math/common/conv_utils.h" +#include "lite/backends/loongarch/math/conv_depthwise_impl.h" +#include "lite/core/memory.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +#define Min(a, b) (a < b ? 
a : b) +#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b)) +void conv_depthwise_5x5s1(const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + auto act_type = act_param.active_type; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int in_len = block_channel * (2 * pad + w_in); + + int channel_num = ROUNDUP(ch_in, block_channel); + float* pack_weight = static_cast( + TargetMalloc(TARGET(kLoongArch), channel_num * 5 * 5 * sizeof(float))); + float* pack_input = static_cast(TargetMalloc( + TARGET(kLoongArch), + (h_in + 2 * pad) * (w_in + 2 * pad) * block_channel * sizeof(float))); + float* pack_out = static_cast(TargetMalloc( + TARGET(kLoongArch), h_out * w_out * block_channel * sizeof(float))); + +#ifdef __loongarch_asx + packC8_common(weights, pack_weight, {0, 0, 0, 0}, 5, 5, ch_in); +#else + packC4_common(weights, pack_weight, {0, 0, 0, 0}, 5, 5, ch_in); +#endif + + for (int n = 0; n < num; n++) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_out * size_out_channel; + + for (int c = 0; c < ch_out; c += block_channel) { + int real_block_channel = Min(block_channel, ch_out - c); + auto* dout_ptr = dout_batch + c * size_out_channel; + auto* din_ptr = din_batch + c * size_in_channel; + auto* weights_data = pack_weight + c * 5 * 5; + +#ifdef __loongarch_asx + packC8_common(din_ptr, + pack_input, + {pad, pad, pad, pad}, + h_in, + w_in, + real_block_channel); +#else + packC4_common(din_ptr, + pack_input, + {pad, pad, pad, pad}, + h_in, + w_in, + real_block_channel); +#endif + + float bias_ptr[block_channel] = {0.f}; + if (flag_bias) { + for (int i = 0; i < block_channel; i++) { + if (real_block_channel > i) { + bias_ptr[i] = *(bias + c + i); + } + } + } + + Type _bias = loadu_ps(bias_ptr); + + for (int i = 0; i < h_out; i++) { + const float* block_inr0 = pack_input + i * in_len; + const float* block_inr1 = block_inr0 + in_len; + const float* block_inr2 = block_inr1 + in_len; + const float* block_inr3 = block_inr2 + in_len; + const float* block_inr4 = block_inr3 + in_len; + int j = 0; + float* dout_block = pack_out + i * w_out * block_channel; + for (; j + 3 < w_out; j += 4) { + Type i00 = loadu_ps(block_inr0); + Type i01 = loadu_ps(block_inr0 + 1 * block_channel); + Type i02 = loadu_ps(block_inr0 + 2 * block_channel); + Type i03 = loadu_ps(block_inr0 + 3 * block_channel); + Type i04 = loadu_ps(block_inr0 + 4 * block_channel); + Type i05 = loadu_ps(block_inr0 + 5 * block_channel); + Type i06 = loadu_ps(block_inr0 + 6 * block_channel); + Type i07 = loadu_ps(block_inr0 + 7 * block_channel); + + Type w00 = loadu_ps(weights_data); + Type r0 = fmadd_ps(i00, w00, _bias); + Type r1 = fmadd_ps(i01, w00, _bias); + Type r2 = fmadd_ps(i02, w00, _bias); + Type r3 = fmadd_ps(i03, w00, _bias); + + Type w01 = loadu_ps(weights_data + block_channel); + r0 = fmadd_ps(i01, w01, r0); + r1 = fmadd_ps(i02, w01, r1); + r2 = fmadd_ps(i03, w01, r2); + r3 = fmadd_ps(i04, w01, r3); + + Type w02 = loadu_ps(weights_data + 2 * block_channel); + r0 = fmadd_ps(i02, w02, r0); + r1 = fmadd_ps(i03, w02, r1); + r2 = fmadd_ps(i04, w02, r2); + r3 = fmadd_ps(i05, w02, r3); + + Type w03 = loadu_ps(weights_data + 3 * block_channel); + r0 = fmadd_ps(i03, w03, r0); + r1 = fmadd_ps(i04, w03, r1); + r2 = fmadd_ps(i05, w03, r2); + r3 
= fmadd_ps(i06, w03, r3); + + Type w04 = loadu_ps(weights_data + 4 * block_channel); + r0 = fmadd_ps(i04, w04, r0); + r1 = fmadd_ps(i05, w04, r1); + r2 = fmadd_ps(i06, w04, r2); + r3 = fmadd_ps(i07, w04, r3); + + Type i10 = loadu_ps(block_inr1); + Type i11 = loadu_ps(block_inr1 + 1 * block_channel); + Type i12 = loadu_ps(block_inr1 + 2 * block_channel); + Type i13 = loadu_ps(block_inr1 + 3 * block_channel); + Type i14 = loadu_ps(block_inr1 + 4 * block_channel); + Type i15 = loadu_ps(block_inr1 + 5 * block_channel); + Type i16 = loadu_ps(block_inr1 + 6 * block_channel); + Type i17 = loadu_ps(block_inr1 + 7 * block_channel); + + Type w10 = loadu_ps(weights_data + 5 * block_channel); + r0 = fmadd_ps(i10, w10, r0); + r1 = fmadd_ps(i11, w10, r1); + r2 = fmadd_ps(i12, w10, r2); + r3 = fmadd_ps(i13, w10, r3); + + Type w11 = loadu_ps(weights_data + 6 * block_channel); + r0 = fmadd_ps(i11, w11, r0); + r1 = fmadd_ps(i12, w11, r1); + r2 = fmadd_ps(i13, w11, r2); + r3 = fmadd_ps(i14, w11, r3); + + Type w12 = loadu_ps(weights_data + 7 * block_channel); + r0 = fmadd_ps(i12, w12, r0); + r1 = fmadd_ps(i13, w12, r1); + r2 = fmadd_ps(i14, w12, r2); + r3 = fmadd_ps(i15, w12, r3); + + Type w13 = loadu_ps(weights_data + 8 * block_channel); + r0 = fmadd_ps(i13, w13, r0); + r1 = fmadd_ps(i14, w13, r1); + r2 = fmadd_ps(i15, w13, r2); + r3 = fmadd_ps(i16, w13, r3); + + Type w14 = loadu_ps(weights_data + 9 * block_channel); + r0 = fmadd_ps(i14, w14, r0); + r1 = fmadd_ps(i15, w14, r1); + r2 = fmadd_ps(i16, w14, r2); + r3 = fmadd_ps(i17, w14, r3); + + Type i20 = loadu_ps(block_inr2); + Type i21 = loadu_ps(block_inr2 + 1 * block_channel); + Type i22 = loadu_ps(block_inr2 + 2 * block_channel); + Type i23 = loadu_ps(block_inr2 + 3 * block_channel); + Type i24 = loadu_ps(block_inr2 + 4 * block_channel); + Type i25 = loadu_ps(block_inr2 + 5 * block_channel); + Type i26 = loadu_ps(block_inr2 + 6 * block_channel); + Type i27 = loadu_ps(block_inr2 + 7 * block_channel); + + Type w20 = loadu_ps(weights_data + 10 * block_channel); + r0 = fmadd_ps(i20, w20, r0); + r1 = fmadd_ps(i21, w20, r1); + r2 = fmadd_ps(i22, w20, r2); + r3 = fmadd_ps(i23, w20, r3); + + Type w21 = loadu_ps(weights_data + 11 * block_channel); + r0 = fmadd_ps(i21, w21, r0); + r1 = fmadd_ps(i22, w21, r1); + r2 = fmadd_ps(i23, w21, r2); + r3 = fmadd_ps(i24, w21, r3); + + Type w22 = loadu_ps(weights_data + 12 * block_channel); + r0 = fmadd_ps(i22, w22, r0); + r1 = fmadd_ps(i23, w22, r1); + r2 = fmadd_ps(i24, w22, r2); + r3 = fmadd_ps(i25, w22, r3); + + Type w23 = loadu_ps(weights_data + 13 * block_channel); + r0 = fmadd_ps(i23, w23, r0); + r1 = fmadd_ps(i24, w23, r1); + r2 = fmadd_ps(i25, w23, r2); + r3 = fmadd_ps(i26, w23, r3); + + Type w24 = loadu_ps(weights_data + 14 * block_channel); + r0 = fmadd_ps(i24, w24, r0); + r1 = fmadd_ps(i25, w24, r1); + r2 = fmadd_ps(i26, w24, r2); + r3 = fmadd_ps(i27, w24, r3); + + Type i30 = loadu_ps(block_inr3); + Type i31 = loadu_ps(block_inr3 + 1 * block_channel); + Type i32 = loadu_ps(block_inr3 + 2 * block_channel); + Type i33 = loadu_ps(block_inr3 + 3 * block_channel); + Type i34 = loadu_ps(block_inr3 + 4 * block_channel); + Type i35 = loadu_ps(block_inr3 + 5 * block_channel); + Type i36 = loadu_ps(block_inr3 + 6 * block_channel); + Type i37 = loadu_ps(block_inr3 + 7 * block_channel); + + Type w30 = loadu_ps(weights_data + 15 * block_channel); + r0 = fmadd_ps(i30, w30, r0); + r1 = fmadd_ps(i31, w30, r1); + r2 = fmadd_ps(i32, w30, r2); + r3 = fmadd_ps(i33, w30, r3); + + Type w31 = loadu_ps(weights_data + 16 * block_channel); 
+ r0 = fmadd_ps(i31, w31, r0); + r1 = fmadd_ps(i32, w31, r1); + r2 = fmadd_ps(i33, w31, r2); + r3 = fmadd_ps(i34, w31, r3); + + Type w32 = loadu_ps(weights_data + 17 * block_channel); + r0 = fmadd_ps(i32, w32, r0); + r1 = fmadd_ps(i33, w32, r1); + r2 = fmadd_ps(i34, w32, r2); + r3 = fmadd_ps(i35, w32, r3); + + Type w33 = loadu_ps(weights_data + 18 * block_channel); + r0 = fmadd_ps(i33, w33, r0); + r1 = fmadd_ps(i34, w33, r1); + r2 = fmadd_ps(i35, w33, r2); + r3 = fmadd_ps(i36, w33, r3); + + Type w34 = loadu_ps(weights_data + 19 * block_channel); + r0 = fmadd_ps(i34, w34, r0); + r1 = fmadd_ps(i35, w34, r1); + r2 = fmadd_ps(i36, w34, r2); + r3 = fmadd_ps(i37, w34, r3); + + Type i40 = loadu_ps(block_inr4); + Type i41 = loadu_ps(block_inr4 + 1 * block_channel); + Type i42 = loadu_ps(block_inr4 + 2 * block_channel); + Type i43 = loadu_ps(block_inr4 + 3 * block_channel); + Type i44 = loadu_ps(block_inr4 + 4 * block_channel); + Type i45 = loadu_ps(block_inr4 + 5 * block_channel); + Type i46 = loadu_ps(block_inr4 + 6 * block_channel); + Type i47 = loadu_ps(block_inr4 + 7 * block_channel); + + Type w40 = loadu_ps(weights_data + 20 * block_channel); + r0 = fmadd_ps(i40, w40, r0); + r1 = fmadd_ps(i41, w40, r1); + r2 = fmadd_ps(i42, w40, r2); + r3 = fmadd_ps(i43, w40, r3); + + Type w41 = loadu_ps(weights_data + 21 * block_channel); + r0 = fmadd_ps(i41, w41, r0); + r1 = fmadd_ps(i42, w41, r1); + r2 = fmadd_ps(i43, w41, r2); + r3 = fmadd_ps(i44, w41, r3); + + Type w42 = loadu_ps(weights_data + 22 * block_channel); + r0 = fmadd_ps(i42, w42, r0); + r1 = fmadd_ps(i43, w42, r1); + r2 = fmadd_ps(i44, w42, r2); + r3 = fmadd_ps(i45, w42, r3); + + Type w43 = loadu_ps(weights_data + 23 * block_channel); + r0 = fmadd_ps(i43, w43, r0); + r1 = fmadd_ps(i44, w43, r1); + r2 = fmadd_ps(i45, w43, r2); + r3 = fmadd_ps(i46, w43, r3); + + Type w44 = loadu_ps(weights_data + 24 * block_channel); + r0 = fmadd_ps(i44, w44, r0); + r1 = fmadd_ps(i45, w44, r1); + r2 = fmadd_ps(i46, w44, r2); + r3 = fmadd_ps(i47, w44, r3); + + Type zero = setzero_ps(); + if (has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + r0 = max_ps(r0, zero); + r1 = max_ps(r1, zero); + r2 = max_ps(r2, zero); + r3 = max_ps(r3, zero); + } else if (act_type == lite_api::ActivationType::kRelu6) { + Type six = set1_ps(act_param.Relu_clipped_coef); + r0 = min_ps(max_ps(r0, zero), six); + r1 = min_ps(max_ps(r1, zero), six); + r2 = min_ps(max_ps(r2, zero), six); + r3 = min_ps(max_ps(r3, zero), six); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + Type negative_slope = set1_ps(act_param.Leaky_relu_alpha); + r0 = blendv_ps( + r0, mul_ps(negative_slope, r0), cmp_ps(r0, zero, 2)); + r1 = blendv_ps( + r1, mul_ps(negative_slope, r1), cmp_ps(r1, zero, 2)); + r2 = blendv_ps( + r2, mul_ps(negative_slope, r2), cmp_ps(r2, zero, 2)); + r3 = blendv_ps( + r3, mul_ps(negative_slope, r3), cmp_ps(r3, zero, 2)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + Type vscale = set1_ps(1.0 / act_param.hard_swish_scale); + Type voffset = set1_ps(act_param.hard_swish_offset); + Type vthreshold = set1_ps(act_param.hard_swish_threshold); + r0 = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r0, voffset))), + mul_ps(r0, vscale)); + r1 = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r1, voffset))), + mul_ps(r1, vscale)); + r2 = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r2, voffset))), + mul_ps(r2, vscale)); + r3 = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r3, voffset))), + mul_ps(r3, vscale)); + } else { + LOG(FATAL) << " 
[LoongArch] activation type " + << static_cast(act_type) << " not supported "; + } + } + + storeu_ps(dout_block, r0); + storeu_ps(dout_block + block_channel, r1); + storeu_ps(dout_block + 2 * block_channel, r2); + storeu_ps(dout_block + 3 * block_channel, r3); + dout_block += 4 * block_channel; + + block_inr0 += 4 * block_channel; + block_inr1 = block_inr0 + in_len; + block_inr2 = block_inr1 + in_len; + block_inr3 = block_inr2 + in_len; + block_inr4 = block_inr3 + in_len; + } + + for (; j < w_out; j++) { + Type r = _bias; + for (int m = 0; m < 5; m++) { + for (int n = 0; n < 5; n++) { + Type weight = loadu_ps(weights_data + 5 * block_channel * m + + block_channel * n); + Type input = loadu_ps(block_inr0 + block_channel * (j % 4) + + in_len * m + block_channel * n); + r = fmadd_ps(input, weight, r); + } + } + Type zero = setzero_ps(); + if (has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + r = max_ps(r, zero); + } else if (act_type == lite_api::ActivationType::kRelu6) { + Type six = set1_ps(act_param.Relu_clipped_coef); + r = min_ps(max_ps(r, zero), six); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + Type negative_slope = set1_ps(act_param.Leaky_relu_alpha); + r = blendv_ps(r, mul_ps(negative_slope, r), cmp_ps(r, zero, 2)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + Type vscale = set1_ps(1.0 / act_param.hard_swish_scale); + Type voffset = set1_ps(act_param.hard_swish_offset); + Type vthreshold = set1_ps(act_param.hard_swish_threshold); + r = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r, voffset))), + mul_ps(r, vscale)); + } else { + LOG(FATAL) << " [LoongArch] activation type " + << static_cast(act_type) << " not supported "; + } + } + storeu_ps(dout_block, r); + dout_block += block_channel; + } + } + +#ifdef __loongarch_asx + unpackC8_common(pack_out, dout_ptr, size_out_channel, real_block_channel); +#else + unpackC4_common(pack_out, dout_ptr, size_out_channel, real_block_channel); +#endif + } + } + + TargetFree(TARGET(kLoongArch), pack_weight); + TargetFree(TARGET(kLoongArch), pack_input); + TargetFree(TARGET(kLoongArch), pack_out); +} +void conv_depthwise_5x5s2(const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + auto act_type = act_param.active_type; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int in_len = block_channel * (2 * pad + w_in); + + int channel_num = ROUNDUP(ch_in, block_channel); + float* pack_weight = static_cast( + TargetMalloc(TARGET(kLoongArch), channel_num * 5 * 5 * sizeof(float))); + float* pack_input = static_cast(TargetMalloc( + TARGET(kLoongArch), + (h_in + 2 * pad) * (w_in + 2 * pad) * block_channel * sizeof(float))); + float* pack_out = static_cast(TargetMalloc( + TARGET(kLoongArch), h_out * w_out * block_channel * sizeof(float))); + +#ifdef __loongarch_asx + packC8_common(weights, pack_weight, {0, 0, 0, 0}, 5, 5, ch_in); +#else + packC4_common(weights, pack_weight, {0, 0, 0, 0}, 5, 5, ch_in); +#endif + + for (int n = 0; n < num; n++) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_out * size_out_channel; + + for (int c = 0; c < ch_out; c += block_channel) { + int real_block_channel = Min(block_channel, ch_out - c); + auto* dout_ptr = dout_batch + c * size_out_channel; + auto* 
din_ptr = din_batch + c * size_in_channel; + auto* weights_data = pack_weight + c * 5 * 5; + +#ifdef __loongarch_asx + packC8_common(din_ptr, + pack_input, + {pad, pad, pad, pad}, + h_in, + w_in, + real_block_channel); +#else + packC4_common(din_ptr, + pack_input, + {pad, pad, pad, pad}, + h_in, + w_in, + real_block_channel); +#endif + + float bias_ptr[block_channel] = {0.f}; + if (flag_bias) { + for (int i = 0; i < block_channel; i++) { + if (real_block_channel > i) { + bias_ptr[i] = *(bias + c + i); + } + } + } + + Type _bias = loadu_ps(bias_ptr); + + for (int i = 0; i < h_out; i++) { + const float* block_inr0 = pack_input + i * 2 * in_len; + const float* block_inr1 = block_inr0 + in_len; + const float* block_inr2 = block_inr1 + in_len; + const float* block_inr3 = block_inr2 + in_len; + const float* block_inr4 = block_inr3 + in_len; + int j = 0; + float* dout_block = pack_out + i * w_out * block_channel; + for (; j + 3 < w_out; j += 4) { + Type i00 = loadu_ps(block_inr0); + Type i01 = loadu_ps(block_inr0 + 1 * block_channel); + Type i02 = loadu_ps(block_inr0 + 2 * block_channel); + Type i03 = loadu_ps(block_inr0 + 3 * block_channel); + Type i04 = loadu_ps(block_inr0 + 4 * block_channel); + Type i05 = loadu_ps(block_inr0 + 5 * block_channel); + Type i06 = loadu_ps(block_inr0 + 6 * block_channel); + Type i07 = loadu_ps(block_inr0 + 7 * block_channel); + Type i08 = loadu_ps(block_inr0 + 8 * block_channel); + Type i09 = loadu_ps(block_inr0 + 9 * block_channel); + Type i0a = loadu_ps(block_inr0 + 10 * block_channel); + + Type w00 = loadu_ps(weights_data); + Type r0 = fmadd_ps(i00, w00, _bias); + Type r1 = fmadd_ps(i02, w00, _bias); + Type r2 = fmadd_ps(i04, w00, _bias); + Type r3 = fmadd_ps(i06, w00, _bias); + + Type w01 = loadu_ps(weights_data + 1 * block_channel); + r0 = fmadd_ps(i01, w01, r0); + r1 = fmadd_ps(i03, w01, r1); + r2 = fmadd_ps(i05, w01, r2); + r3 = fmadd_ps(i07, w01, r3); + + Type w02 = loadu_ps(weights_data + 2 * block_channel); + r0 = fmadd_ps(i02, w02, r0); + r1 = fmadd_ps(i04, w02, r1); + r2 = fmadd_ps(i06, w02, r2); + r3 = fmadd_ps(i08, w02, r3); + + Type w03 = loadu_ps(weights_data + 3 * block_channel); + r0 = fmadd_ps(i03, w03, r0); + r1 = fmadd_ps(i05, w03, r1); + r2 = fmadd_ps(i07, w03, r2); + r3 = fmadd_ps(i09, w03, r3); + + Type w04 = loadu_ps(weights_data + 4 * block_channel); + r0 = fmadd_ps(i04, w04, r0); + r1 = fmadd_ps(i06, w04, r1); + r2 = fmadd_ps(i08, w04, r2); + r3 = fmadd_ps(i0a, w04, r3); + + Type i10 = loadu_ps(block_inr1); + Type i11 = loadu_ps(block_inr1 + 1 * block_channel); + Type i12 = loadu_ps(block_inr1 + 2 * block_channel); + Type i13 = loadu_ps(block_inr1 + 3 * block_channel); + Type i14 = loadu_ps(block_inr1 + 4 * block_channel); + Type i15 = loadu_ps(block_inr1 + 5 * block_channel); + Type i16 = loadu_ps(block_inr1 + 6 * block_channel); + Type i17 = loadu_ps(block_inr1 + 7 * block_channel); + Type i18 = loadu_ps(block_inr1 + 8 * block_channel); + Type i19 = loadu_ps(block_inr1 + 9 * block_channel); + Type i1a = loadu_ps(block_inr1 + 10 * block_channel); + + Type w10 = loadu_ps(weights_data + 5 * block_channel); + r0 = fmadd_ps(i10, w10, r0); + r1 = fmadd_ps(i12, w10, r1); + r2 = fmadd_ps(i14, w10, r2); + r3 = fmadd_ps(i16, w10, r3); + + Type w11 = loadu_ps(weights_data + 6 * block_channel); + r0 = fmadd_ps(i11, w11, r0); + r1 = fmadd_ps(i13, w11, r1); + r2 = fmadd_ps(i15, w11, r2); + r3 = fmadd_ps(i17, w11, r3); + + Type w12 = loadu_ps(weights_data + 7 * block_channel); + r0 = fmadd_ps(i12, w12, r0); + r1 = fmadd_ps(i14, w12, r1); + r2 = 
fmadd_ps(i16, w12, r2); + r3 = fmadd_ps(i18, w12, r3); + + Type w13 = loadu_ps(weights_data + 8 * block_channel); + r0 = fmadd_ps(i13, w13, r0); + r1 = fmadd_ps(i15, w13, r1); + r2 = fmadd_ps(i17, w13, r2); + r3 = fmadd_ps(i19, w13, r3); + + Type w14 = loadu_ps(weights_data + 9 * block_channel); + r0 = fmadd_ps(i14, w14, r0); + r1 = fmadd_ps(i16, w14, r1); + r2 = fmadd_ps(i18, w14, r2); + r3 = fmadd_ps(i1a, w14, r3); + + Type i20 = loadu_ps(block_inr2); + Type i21 = loadu_ps(block_inr2 + 1 * block_channel); + Type i22 = loadu_ps(block_inr2 + 2 * block_channel); + Type i23 = loadu_ps(block_inr2 + 3 * block_channel); + Type i24 = loadu_ps(block_inr2 + 4 * block_channel); + Type i25 = loadu_ps(block_inr2 + 5 * block_channel); + Type i26 = loadu_ps(block_inr2 + 6 * block_channel); + Type i27 = loadu_ps(block_inr2 + 7 * block_channel); + Type i28 = loadu_ps(block_inr2 + 8 * block_channel); + Type i29 = loadu_ps(block_inr2 + 9 * block_channel); + Type i2a = loadu_ps(block_inr2 + 10 * block_channel); + + Type w20 = loadu_ps(weights_data + 10 * block_channel); + r0 = fmadd_ps(i20, w20, r0); + r1 = fmadd_ps(i22, w20, r1); + r2 = fmadd_ps(i24, w20, r2); + r3 = fmadd_ps(i26, w20, r3); + + Type w21 = loadu_ps(weights_data + 11 * block_channel); + r0 = fmadd_ps(i21, w21, r0); + r1 = fmadd_ps(i23, w21, r1); + r2 = fmadd_ps(i25, w21, r2); + r3 = fmadd_ps(i27, w21, r3); + + Type w22 = loadu_ps(weights_data + 12 * block_channel); + r0 = fmadd_ps(i22, w22, r0); + r1 = fmadd_ps(i24, w22, r1); + r2 = fmadd_ps(i26, w22, r2); + r3 = fmadd_ps(i28, w22, r3); + + Type w23 = loadu_ps(weights_data + 13 * block_channel); + r0 = fmadd_ps(i23, w23, r0); + r1 = fmadd_ps(i25, w23, r1); + r2 = fmadd_ps(i27, w23, r2); + r3 = fmadd_ps(i29, w23, r3); + + Type w24 = loadu_ps(weights_data + 14 * block_channel); + r0 = fmadd_ps(i24, w24, r0); + r1 = fmadd_ps(i26, w24, r1); + r2 = fmadd_ps(i28, w24, r2); + r3 = fmadd_ps(i2a, w24, r3); + + Type i30 = loadu_ps(block_inr3); + Type i31 = loadu_ps(block_inr3 + 1 * block_channel); + Type i32 = loadu_ps(block_inr3 + 2 * block_channel); + Type i33 = loadu_ps(block_inr3 + 3 * block_channel); + Type i34 = loadu_ps(block_inr3 + 4 * block_channel); + Type i35 = loadu_ps(block_inr3 + 5 * block_channel); + Type i36 = loadu_ps(block_inr3 + 6 * block_channel); + Type i37 = loadu_ps(block_inr3 + 7 * block_channel); + Type i38 = loadu_ps(block_inr3 + 8 * block_channel); + Type i39 = loadu_ps(block_inr3 + 9 * block_channel); + Type i3a = loadu_ps(block_inr3 + 10 * block_channel); + + Type w30 = loadu_ps(weights_data + 15 * block_channel); + r0 = fmadd_ps(i30, w30, r0); + r1 = fmadd_ps(i32, w30, r1); + r2 = fmadd_ps(i34, w30, r2); + r3 = fmadd_ps(i36, w30, r3); + + Type w31 = loadu_ps(weights_data + 16 * block_channel); + r0 = fmadd_ps(i31, w31, r0); + r1 = fmadd_ps(i33, w31, r1); + r2 = fmadd_ps(i35, w31, r2); + r3 = fmadd_ps(i37, w31, r3); + + Type w32 = loadu_ps(weights_data + 17 * block_channel); + r0 = fmadd_ps(i32, w32, r0); + r1 = fmadd_ps(i34, w32, r1); + r2 = fmadd_ps(i36, w32, r2); + r3 = fmadd_ps(i38, w32, r3); + + Type w33 = loadu_ps(weights_data + 18 * block_channel); + r0 = fmadd_ps(i33, w33, r0); + r1 = fmadd_ps(i35, w33, r1); + r2 = fmadd_ps(i37, w33, r2); + r3 = fmadd_ps(i39, w33, r3); + + Type w34 = loadu_ps(weights_data + 19 * block_channel); + r0 = fmadd_ps(i34, w34, r0); + r1 = fmadd_ps(i36, w34, r1); + r2 = fmadd_ps(i38, w34, r2); + r3 = fmadd_ps(i3a, w34, r3); + + Type i40 = loadu_ps(block_inr4); + Type i41 = loadu_ps(block_inr4 + 1 * block_channel); + Type i42 = 
loadu_ps(block_inr4 + 2 * block_channel); + Type i43 = loadu_ps(block_inr4 + 3 * block_channel); + Type i44 = loadu_ps(block_inr4 + 4 * block_channel); + Type i45 = loadu_ps(block_inr4 + 5 * block_channel); + Type i46 = loadu_ps(block_inr4 + 6 * block_channel); + Type i47 = loadu_ps(block_inr4 + 7 * block_channel); + Type i48 = loadu_ps(block_inr4 + 8 * block_channel); + Type i49 = loadu_ps(block_inr4 + 9 * block_channel); + Type i4a = loadu_ps(block_inr4 + 10 * block_channel); + + Type w40 = loadu_ps(weights_data + 20 * block_channel); + r0 = fmadd_ps(i40, w40, r0); + r1 = fmadd_ps(i42, w40, r1); + r2 = fmadd_ps(i44, w40, r2); + r3 = fmadd_ps(i46, w40, r3); + + Type w41 = loadu_ps(weights_data + 21 * block_channel); + r0 = fmadd_ps(i41, w41, r0); + r1 = fmadd_ps(i43, w41, r1); + r2 = fmadd_ps(i45, w41, r2); + r3 = fmadd_ps(i47, w41, r3); + + Type w42 = loadu_ps(weights_data + 22 * block_channel); + r0 = fmadd_ps(i42, w42, r0); + r1 = fmadd_ps(i44, w42, r1); + r2 = fmadd_ps(i46, w42, r2); + r3 = fmadd_ps(i48, w42, r3); + + Type w43 = loadu_ps(weights_data + 23 * block_channel); + r0 = fmadd_ps(i43, w43, r0); + r1 = fmadd_ps(i45, w43, r1); + r2 = fmadd_ps(i47, w43, r2); + r3 = fmadd_ps(i49, w43, r3); + + Type w44 = loadu_ps(weights_data + 24 * block_channel); + r0 = fmadd_ps(i44, w44, r0); + r1 = fmadd_ps(i46, w44, r1); + r2 = fmadd_ps(i48, w44, r2); + r3 = fmadd_ps(i4a, w44, r3); + + Type zero = setzero_ps(); + if (has_active) { // process activation + if (act_type == lite_api::ActivationType::kRelu) { + r0 = max_ps(r0, zero); + r1 = max_ps(r1, zero); + r2 = max_ps(r2, zero); + r3 = max_ps(r3, zero); + } else if (act_type == lite_api::ActivationType::kRelu6) { + Type six = set1_ps(act_param.Relu_clipped_coef); + r0 = min_ps(max_ps(r0, zero), six); + r1 = min_ps(max_ps(r1, zero), six); + r2 = min_ps(max_ps(r2, zero), six); + r3 = min_ps(max_ps(r3, zero), six); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + Type negative_slope = set1_ps(act_param.Leaky_relu_alpha); + r0 = blendv_ps( + r0, mul_ps(negative_slope, r0), cmp_ps(r0, zero, 2)); + r1 = blendv_ps( + r1, mul_ps(negative_slope, r1), cmp_ps(r1, zero, 2)); + r2 = blendv_ps( + r2, mul_ps(negative_slope, r2), cmp_ps(r2, zero, 2)); + r3 = blendv_ps( + r3, mul_ps(negative_slope, r3), cmp_ps(r3, zero, 2)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + Type vscale = set1_ps(1.0 / act_param.hard_swish_scale); + Type voffset = set1_ps(act_param.hard_swish_offset); + Type vthreshold = set1_ps(act_param.hard_swish_threshold); + r0 = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r0, voffset))), + mul_ps(r0, vscale)); + r1 = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r1, voffset))), + mul_ps(r1, vscale)); + r2 = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r2, voffset))), + mul_ps(r2, vscale)); + r3 = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r3, voffset))), + mul_ps(r3, vscale)); + } else { + LOG(FATAL) << " [LoongArch] activation type " + << static_cast(act_type) << " not supported "; + } + } + + storeu_ps(dout_block, r0); + storeu_ps(dout_block + 1 * block_channel, r1); + storeu_ps(dout_block + 2 * block_channel, r2); + storeu_ps(dout_block + 3 * block_channel, r3); + dout_block += 4 * block_channel; + + block_inr0 += 8 * block_channel; + block_inr1 = block_inr0 + in_len; + block_inr2 = block_inr1 + in_len; + block_inr3 = block_inr2 + in_len; + block_inr4 = block_inr3 + in_len; + } + + for (; j < w_out; j++) { + Type r = _bias; + for (int m = 0; m < 5; m++) { + for (int n = 0; n < 5; n++) { + Type 
weight = loadu_ps(weights_data + 5 * block_channel * m + + block_channel * n); + Type input = loadu_ps(block_inr0 + block_channel * (j % 4) * 2 + + in_len * m + block_channel * n); + r = fmadd_ps(input, weight, r); + } + } + Type zero = setzero_ps(); + if (has_active) { // process activation + if (act_type == lite_api::ActivationType::kRelu) { + r = max_ps(r, zero); + } else if (act_type == lite_api::ActivationType::kRelu6) { + Type six = set1_ps(act_param.Relu_clipped_coef); + r = min_ps(max_ps(r, zero), six); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + Type negative_slope = set1_ps(act_param.Leaky_relu_alpha); + r = blendv_ps(r, mul_ps(negative_slope, r), cmp_ps(r, zero, 2)); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + Type vscale = set1_ps(1.0 / act_param.hard_swish_scale); + Type voffset = set1_ps(act_param.hard_swish_offset); + Type vthreshold = set1_ps(act_param.hard_swish_threshold); + r = mul_ps(min_ps(vthreshold, max_ps(zero, add_ps(r, voffset))), + mul_ps(r, vscale)); + } else { + LOG(FATAL) << " [LoongArch] activation type " + << static_cast(act_type) << "not supported"; + } + } + storeu_ps(dout_block, r); + dout_block += block_channel; + } + } + +#ifdef __loongarch_asx + unpackC8_common(pack_out, dout_ptr, size_out_channel, real_block_channel); +#else + unpackC4_common(pack_out, dout_ptr, size_out_channel, real_block_channel); +#endif + } + } + + TargetFree(TARGET(kLoongArch), pack_weight); + TargetFree(TARGET(kLoongArch), pack_input); + TargetFree(TARGET(kLoongArch), pack_out); +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv_depthwise_5x5.h b/lite/backends/loongarch/math/conv_depthwise_5x5.h new file mode 100644 index 00000000000..0987ecfb00c --- /dev/null +++ b/lite/backends/loongarch/math/conv_depthwise_5x5.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include "lite/backends/loongarch/xxl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +#ifdef __loongarch_asx +#define loadu_ps(a) lasx_loadu_f32(a) +#define fmadd_ps(a, b, c) lasx_fmadd_f32(a, b, c) +#define storeu_ps(a, b) lasx_storeu_f32(a, b) +#define setzero_ps() lasx_setzero_f32() +#define max_ps(a, b) lasx_max_f32(a, b) +#define min_ps(a, b) lasx_min_f32(a, b) +#define set1_ps(a) lasx_set1_f32(a) +#define mul_ps(a, b) lasx_mul_f32(a, b) +#define cmp_ps(a, b, c) lasx_cmp_f32(a, b, c) +#define blendv_ps(a, b, c) lasx_blendv_f32(a, b, c) +#define add_ps(a, b) lasx_add_f32(a, b) +#define block_channel 8 +#define Type __m256 +#else +#define loadu_ps(a) lsx_loadu_f32(a) +#define storeu_ps(a, b) lsx_storeu_f32(a, b) +#define fmadd_ps(a, b, c) lsx_add_f32(lsx_mul_f32(a, b), c) +#define setzero_ps() lsx_setzero_f32() +#define max_ps(a, b) lsx_max_f32(a, b) +#define min_ps(a, b) lsx_min_f32(a, b) +#define set1_ps(a) lsx_set1_f32(a) +#define mul_ps(a, b) lsx_mul_f32(a, b) +#define cmp_ps(a, b, c) lsx_cmp_f32(a, b, c) +#define blendv_ps(a, b, c) lsx_blendv_f32(a, b, c) +#define add_ps(a, b) lsx_add_f32(a, b) +#define block_channel 4 +#define Type __m128 +#endif + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv_depthwise_impl.h b/lite/backends/loongarch/math/conv_depthwise_impl.h new file mode 100644 index 00000000000..800cea1314d --- /dev/null +++ b/lite/backends/loongarch/math/conv_depthwise_impl.h @@ -0,0 +1,90 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
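The conv_depthwise_5x5.h macros above let one 5x5 kernel body target either LASX (`__m256`, block_channel = 8) or LSX (`__m128`, block_channel = 4); on LSX, `fmadd_ps` is emulated as a multiply followed by an add. As a rough, illustrative scalar reference for what one packed output pixel computes (the function name and parameters below are assumptions for illustration, not part of the patch; the layouts follow the packC8/packC4 code above):

    // Illustrative only: scalar equivalent of one 5x5 depthwise output pixel on a
    // packed channel block. Weights are packed as [5][5][block_channel]; each padded
    // input row occupies in_len = block_channel * (w_in + 2 * pad) floats.
    void depthwise_5x5_ref_pixel(const float* window,   // top-left of the 5x5 window
                                 const float* weights,  // packed [5][5][block_channel]
                                 const float* bias,     // block_channel values
                                 float* out,            // block_channel values
                                 int block_channel,
                                 int in_len) {          // stride between padded rows
      for (int c = 0; c < block_channel; ++c) out[c] = bias[c];
      for (int kh = 0; kh < 5; ++kh) {
        for (int kw = 0; kw < 5; ++kw) {
          const float* w = weights + (kh * 5 + kw) * block_channel;
          const float* x = window + kh * in_len + kw * block_channel;
          for (int c = 0; c < block_channel; ++c) {
            out[c] += x[c] * w[c];  // per-lane effect of fmadd_ps(x, w, acc)
          }
        }
      }
    }

This mirrors the index arithmetic used in conv_depthwise_5x5.cc (`weights_data + 5 * block_channel * m + block_channel * n` and `block_inr0 + in_len * m + block_channel * n`), with the vector width factored out into the inner channel loop.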
+ +#pragma once + +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +void conv_depthwise_3x3s1_p01_direct( + const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + const operators::ActivationParam act_param); +void conv_depthwise_3x3s2_p01_direct( + const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + const operators::ActivationParam act_param); +void conv_depthwise_5x5s1(const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + const operators::ActivationParam act_param); +void conv_depthwise_5x5s2(const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + const operators::ActivationParam act_param); +void conv_depthwise_3x3_pack(const operators::ConvParam& param, + lite::Tensor* input_padding_, + lite::Tensor* input_pack_, + lite::Tensor* filter_pack_, + lite::Tensor* output_pack_); +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/conv_depthwise_int8.h b/lite/backends/loongarch/math/conv_depthwise_int8.h new file mode 100644 index 00000000000..79cf6df649a --- /dev/null +++ b/lite/backends/loongarch/math/conv_depthwise_int8.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
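The conv_depthwise_impl.h declarations above only expose the entry points; a caller is expected to route on kernel size and stride. A hypothetical dispatch sketch (not taken from the patch; the free function, its name, and its parameter list are assumptions) could look like:

    // Assumed illustration: route a depthwise conv (groups == channels) to the
    // kernels declared in conv_depthwise_impl.h by kernel size and stride.
    #include "lite/backends/loongarch/math/conv_depthwise_impl.h"

    void dispatch_depthwise(const float* din, float* dout,
                            int num, int ch, int h_in, int w_in,
                            int h_out, int w_out,
                            int kernel, int stride, int pad,
                            const float* weights, const float* bias, bool flag_bias,
                            const paddle::lite::operators::ActivationParam& act) {
      using namespace paddle::lite::loongarch::math;
      if (kernel == 5 && stride == 1) {
        conv_depthwise_5x5s1(din, dout, num, ch, h_out, w_out, ch, h_in, w_in,
                             weights, bias, pad, flag_bias, act);
      } else if (kernel == 5 && stride == 2) {
        conv_depthwise_5x5s2(din, dout, num, ch, h_out, w_out, ch, h_in, w_in,
                             weights, bias, pad, flag_bias, act);
      } else if (kernel == 3 && stride == 1) {
        conv_depthwise_3x3s1_p01_direct(din, dout, num, ch, h_out, w_out, ch,
                                        h_in, w_in, weights, bias, pad,
                                        flag_bias, act);
      } else if (kernel == 3 && stride == 2) {
        conv_depthwise_3x3s2_p01_direct(din, dout, num, ch, h_out, w_out, ch,
                                        h_in, w_in, weights, bias, pad,
                                        flag_bias, act);
      }
    }

The actual selection logic in the backend may differ; this only shows how the declared signatures fit together for the depthwise case where `ch_in == ch_out == ch`.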
+ +#pragma once +#include "lite/core/context.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +void conv_3x3s1_dw_int8(Dtype* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); + +template +void conv_3x3s2p0_dw_int8(Dtype* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); + +template +void conv_3x3s2p1_dw_int8(Dtype* dout, + const int8_t* din, + const int8_t* weights, + const float* bias, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int pad_h, + int pad_w, + int flag_act, + float alpha, + const float* scale, + LoongArchContext* ctx); +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/cos_sim_functor.cc b/lite/backends/loongarch/math/cos_sim_functor.cc new file mode 100644 index 00000000000..d2a3771a8b2 --- /dev/null +++ b/lite/backends/loongarch/math/cos_sim_functor.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/cos_sim_functor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct CosSimDyFunctor { + void operator()(const lite::LoongArchContext& ctx, + const T* x_norm, + const T* y_norm, + const T* x, + const T* y, + const T* z, + const T* dz, + const size_t rows, + const size_t cols, + T* dy) const { + for (size_t row_id = 0; row_id < rows; ++row_id) { + auto xy_norm_prod = x_norm[row_id] * y_norm[0]; + auto dz_data = dz[row_id]; + auto z_data = z[row_id]; + auto* x_data = x + cols * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm[0] * y_norm[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + } + } + } +}; + +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/cos_sim_functor.h b/lite/backends/loongarch/math/cos_sim_functor.h new file mode 100644 index 00000000000..12e12c54fa4 --- /dev/null +++ b/lite/backends/loongarch/math/cos_sim_functor.h @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/core/context.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct CosSimFunctor { + CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto* x = x_ + cols_ * row_id; + T xx = 0, xy = 0, yy = 0; + if (same_row) { + auto* y = y_ + cols_ * row_id; + T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + y_norm_[row_id] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } else { // This can be wrote in a better way. + T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y_[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + if (row_id == 0) y_norm_[0] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } + } + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, + const T* y_norm, + const T* x, + const T* y, + const T* z, + const T* dz, + T* dx, + int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; + + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, + const T* y_norm, + const T* x, + const T* y, + const T* z, + const T* dz, + T* dx, + int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; + auto reciprocal_x_norm_square = 1 / x_norm_square; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + const T* x_norm_; + const T* y_norm_; + const T* 
x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDyFunctor { + void operator()(const lite::Context& ctx, + const T* x_norm, + const T* y_norm, + const T* x, + const T* y, + const T* z, + const T* dz, + const size_t rows, + const size_t cols, + T* dy) const; +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/cpu_vec.h b/lite/backends/loongarch/math/cpu_vec.h new file mode 100644 index 00000000000..b3793e4e234 --- /dev/null +++ b/lite/backends/loongarch/math/cpu_vec.h @@ -0,0 +1,529 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "lite/backends/loongarch/cpu_info.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 + +#define LASX_FLOAT_BLOCK 8 + +template +inline void vec_exp(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +inline void vec_scal(const int n, const T a, T* x) { + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} + +template +inline void vec_scal(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a * x[i]; + } +} + +template <> +inline void vec_scal(const int n, + const float a, + const float* x, + float* y) { +#ifdef __loongarch_asx + constexpr int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_scal(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 scalar = lasx_set1_f32(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = lasx_loadu_f32(x + i); \ + tmp = lasx_mul_f32(tmp, scalar); \ + lasx_storeu_f32(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a * x[i]; + } +#else + vec_scal(n, a, x, y); +#endif +} + +template +inline void vec_sum(const size_t n, const T* x, T* s) { + s[0] = x[0]; + for (size_t i = 1; i < n; ++i) { + s[0] += x[i]; + } +} + +template <> +inline void vec_sum(const size_t n, + const float* x, + float* s) { +#ifdef __loongarch_asx + constexpr unsigned int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_sum(n, x, s); + return; + } + + unsigned int i, end; + i = end = 0; + s[0] = 0.f; + + end = n & ~(block - 1); + __m256 tmp = lasx_setzero_f32(); + for (i = 0; i < end; i += block) { + tmp = lasx_add_f32(tmp, lasx_load_f32(x + i)); + } + + __m256 hsum = lasx_hadd_f32(tmp, tmp); + hsum = lasx_add_f32(hsum, lasx_permute2f128_f32(hsum, hsum, 0x1)); + lsx_store_1f32( + s, + lsx_hadd_f32(lasx_castm256_m128(hsum), lasx_castm256_m128(hsum))); + + for (; i < n; i++) { + s[0] += x[i]; + } +#else + vec_sum(n, x, s); +#endif +} + +template 
+inline void vec_mul(const size_t n, const T* x, const T* y, T* z) { + for (size_t i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +template <> +inline void vec_mul(const size_t n, + const float* x, + const float* y, + float* z) { +#ifdef __loongarch_asx + constexpr unsigned int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_mul(n, x, y, z); + return; + } + + unsigned int i = 0, end = 0; + end = n & ~(block - 1); + for (i = 0; i < end; i += block) { + lasx_storeu_f32( + z + i, lasx_mul_f32(lasx_loadu_f32(x + i), lasx_loadu_f32(y + i))); + } + + for (; i < n; i++) { + z[i] = x[i] * y[i]; + } +#else + vec_mul(n, x, y, z); +#endif +} + +template +inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) { + z[0] = x[0] * y[0]; + for (size_t i = 1; i < n; ++i) { + z[0] += x[i] * y[i]; + } +} + +template <> +inline void vec_mul_reduce(const size_t n, + const float* x, + const float* y, + float* z) { +#ifdef __loongarch_asx + constexpr unsigned int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_mul_reduce(n, x, y, z); + return; + } + + unsigned int i = 0, end = 0; + z[0] = 0.f; + + end = n & ~(block - 1); + __m256 tmp = lasx_setzero_f32(); + for (i = 0; i < end; i += block) { + tmp = lasx_add_f32( + tmp, lasx_mul_f32(lasx_loadu_f32(x + i), lasx_loadu_f32(y + i))); + } + + __m256 hsum = lasx_hadd_f32(tmp, tmp); + hsum = lasx_add_f32(hsum, lasx_permute2f128_f32(hsum, hsum, 0x1)); + lsx_store_1f32( + z, + lsx_hadd_f32(lasx_castm256_m128(hsum), lasx_castm256_m128(hsum))); + + for (; i < n; i++) { + z[0] += x[i] * y[i]; + } +#else + vec_mul_reduce(n, x, y, z); +#endif +} + +template +inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a - x[i]; + } +} + +template <> +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { +#ifdef __loongarch_asx + constexpr int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_bias_sub(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = lasx_set1_f32(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = lasx_loadu_f32(x + i); \ + tmp = lasx_sub_f32(bias, tmp); \ + lasx_storeu_f32(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a - x[i]; + } +#else + vec_bias_sub(n, a, x, y); +#endif +} + +// out = x*y + (1-x)*z +template +inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { + for (int i = 0; i < n; ++i) { + out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; + } +} + +template <> +inline void vec_cross( + const int n, const float* x, const float* y, const float* z, float* out) { +#ifdef __loongarch_asx + constexpr int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_cross(n, x, y, z, out); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = lasx_set1_f32(1.f); + __m256 tmpx, tmpy, tmpz; + for (i = 0; i < end; i += block) { + tmpx = lasx_loadu_f32(x + i); + tmpy = lasx_loadu_f32(y + i); + tmpz = lasx_loadu_f32(z + i); + tmpy = lasx_mul_f32(tmpx, tmpy); + tmpx = lasx_sub_f32(bias, tmpx); + tmpz = lasx_mul_f32(tmpx, tmpz); + tmpz = lasx_add_f32(tmpy, tmpz); + lasx_storeu_f32(out + i, tmpz); + } + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + out[i] = x[i] * 
y[i] + (1.f - x[i]) * z[i]; + } +#else + vec_cross(n, x, y, z, out); +#endif +} + +template +inline void vec_clip(const size_t n, const T a, const T* x, T* y) { + for (size_t i = 0; i < n; ++i) { + y[i] = x[i] < a ? a : x[i]; + } +} + +template <> +inline void vec_clip(const size_t n, + const float a, + const float* x, + float* y) { +#ifdef __loongarch_asx + constexpr unsigned int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_clip(n, a, x, y); + return; + } + + unsigned int i = 0, end = 0; + end = n & ~(block - 1); + __m256 threshold = lasx_set1_f32(a); + + for (i = 0; i < end; i += block) { + lasx_storeu_f32(y + i, lasx_max_f32(lasx_loadu_f32(x + i), threshold)); + } + + for (; i < n; i++) { + y[i] = x[i] < a ? a : x[i]; + } +#else + vec_clip(n, a, x, y); +#endif +} + +template +inline void vec_add_bias(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + a; + } +} + +template <> +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { +#ifdef __loongarch_asx + constexpr int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_add_bias(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = lasx_set1_f32(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = lasx_loadu_f32(x + i); \ + tmp = lasx_add_f32(tmp, bias); \ + lasx_storeu_f32(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = x[i] + a; + } +#else + vec_add_bias(n, a, x, y); +#endif +} + +template +inline void vec_identity(const int n, const T* x, T* y) { + // do nothing + return; +} + +template +inline void vec_sigmoid(const int n, const T* x, T* y) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + vec_exp(n, y, y); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template <> +inline void vec_sigmoid(const int n, + const float* x, + float* y) { +#ifdef __loongarch_asx + constexpr int block = LASX_FLOAT_BLOCK; + if (n < block) { + vec_sigmoid(n, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 max = lasx_set1_f32(SIGMOID_THRESHOLD_MAX); + __m256 min = lasx_set1_f32(SIGMOID_THRESHOLD_MIN); + __m256 zeros = lasx_setzero_f32(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = lasx_loadu_f32(x + i); \ + tmp = lasx_max_f32(tmp, min); \ + tmp = lasx_min_f32(tmp, max); \ + tmp = lasx_sub_f32(zeros, tmp); \ + lasx_storeu_f32(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest != 0) { + // can not continue move step since the src and dst address could be equal + const float xmin = SIGMOID_THRESHOLD_MIN; + const float xmax = SIGMOID_THRESHOLD_MAX; + for (i = n - rest; i < n; ++i) { + y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? 
xmax : x[i])); + } + } + + vec_exp(n, y, y); + + __m256 ones = lasx_set1_f32(1.0f); +#define MOVE_ONE_STEP \ + tmp = lasx_loadu_f32(y + i); \ + tmp = lasx_add_f32(ones, tmp); \ + tmp = lasx_div_f32(ones, tmp); \ + lasx_storeu_f32(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step + for (i = n - rest; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +#else + vec_sigmoid(n, x, y); +#endif +} + +template +inline void vec_tanh(const int n, const T* x, T* y) { + vec_scal(n, static_cast(2), x, y); + vec_sigmoid(n, y, y); + vec_scal(n, static_cast(2), y); + vec_add_bias(n, static_cast(-1), y, y); +} + +// TODO(TJ): make relu clip +template +inline void vec_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, + const float* x, + float* y) { +#ifdef __loongarch_asx + constexpr int block = LASX_FLOAT_BLOCK; + if (n < block * 4) { + vec_relu(n, x, y); + return; + } + + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 zeros = lasx_setzero_f32(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = lasx_loadu_f32(x + i); \ + tmp = lasx_max_f32(tmp, zeros); \ + lasx_storeu_f32(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } + if (rest == 0) { + return; + } + i = n - block; + MOVE_ONE_STEP; +#undef MOVE_ONE_STEP + +#else + vec_relu(n, x, y); +#endif +} + +// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary + +template +class VecActivations { + public: + std::function operator()( + const std::string& type) { + if (type == "sigmoid") { + return vec_sigmoid; + } else if (type == "relu") { + return vec_relu; + } else if (type == "tanh") { + return vec_tanh; + } else if (type == "identity" || type == "") { + return vec_identity; + } + LOG(FATAL) << "Not support type: " << type; + } +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/cross_entropy.cc b/lite/backends/loongarch/math/cross_entropy.cc new file mode 100644 index 00000000000..71d4e34eb97 --- /dev/null +++ b/lite/backends/loongarch/math/cross_entropy.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/loongarch/math/cross_entropy.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +using EigenMatrix = lite::fluid::EigenMatrix; + +template +class CrossEntropyFunctor { + public: + void operator()(const lite::LoongArchContext& ctx, + lite::Tensor* out, + const lite::Tensor* prob, + const lite::Tensor* labels, + const bool softLabel, + const int ignore_index, + const int axis_dim) { + const int batch_size = prob->dims()[0]; + const int num_classes = prob->dims()[1]; + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + if (softLabel) { + auto in = EigenMatrix::From(*prob); + auto lbl = EigenMatrix::From(*labels); + auto loss = EigenMatrix::From(*out); + + loss.device(lite::fluid::EigenDeviceType()) = + -((lbl * in.log().unaryExpr(math::TolerableValue())) + .reshape(batch_axis_remain) + .sum(Eigen::DSizes(1))); + } else { + const T* prob_data = prob->template data(); + T* loss_data = out->template mutable_data(); + + const int64_t* label_data = labels->data(); + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < num_remain; j++) { + int lbl = label_data[i * num_remain + j]; + CHECK((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); + int index = i * num_classes + lbl * num_remain + j; + int loss_idx = i * num_remain + j; + loss_data[loss_idx] = + lbl == ignore_index + ? 0 + : -math::TolerableValue()(std::log(prob_data[index])); + } + } + } + } +}; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/cross_entropy.h b/lite/backends/loongarch/math/cross_entropy.h new file mode 100644 index 00000000000..3f03f024684 --- /dev/null +++ b/lite/backends/loongarch/math/cross_entropy.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + CHECK(static_cast(std::is_floating_point::value)); + const T kApproInf = 1e20; + + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +// NOTE(dzh): float16 value clip behave different. +// 1. Our ValueClipping has a hardcore threshold 1e20 +// for float number. 1e20 will resulting in overflow in float16. +// 2. float16 should expose the the real number overflow to python. +// because mixed-training depends the inf/nan value to determine +// if the scale value will be adjusted. +// Also. In standard implementation of cross entropy, other +// framework not has the ValueClipping. 
+template <> +struct TolerableValue { + HOSTDEVICE lite::fluid::float16 operator()( + const lite::fluid::float16& x) const { + if (lite::fluid::isfinite(x)) + return x; + else if (x > static_cast(0)) + return std::numeric_limits::max(); + else + return std::numeric_limits::min(); + } +}; + +template +class CrossEntropyFunctor { + public: + void operator()(const lite::Context& context, + lite::Tensor* out, + const lite::Tensor* prob, + const lite::Tensor* labels, + const bool softLabel, + const int ignore_index, + const int axis_dim); +}; +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/elementwise.h b/lite/backends/loongarch/math/elementwise.h new file mode 100644 index 00000000000..28164eb036c --- /dev/null +++ b/lite/backends/loongarch/math/elementwise.h @@ -0,0 +1,177 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/loongarch/math/elementwise_common_broadcast_config.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +#define ElementWiseFunc(op) \ + template \ + void Elementwise_##op(const T* dinx, \ + const T* diny, \ + T* dout, \ + int num, \ + bool has_active, \ + std::string act_type) { \ + if (act_type == "tanh") { \ + lite::loongarch::math::elementwise_range_to_range< \ + MergeConfig, ActiveConfig>>( \ + dinx, diny, dout, num); \ + } else if (act_type == "relu") { \ + lite::loongarch::math::elementwise_range_to_range< \ + MergeConfig, ActiveConfig>>( \ + dinx, diny, dout, num); \ + } else if (act_type == "sigmoid") { \ + lite::loongarch::math::elementwise_range_to_range< \ + MergeConfig, ActiveConfig>>( \ + dinx, diny, dout, num); \ + } else { \ + lite::loongarch::math::elementwise_range_to_range< \ + MergeConfig, ActiveConfig>>( \ + dinx, diny, dout, num); \ + } \ + } + +#define ElementWiseFuncBCast(op) \ + template \ + void Elementwise_Broadcast_##op(const T* dinx, \ + const T* diny, \ + T* dout, \ + int batch, \ + int channels, \ + int num, \ + bool has_active, \ + std::string act_type, \ + bool inv) { \ + if (act_type == "tanh") { \ + for (int i = 0; i < batch; ++i) { \ + for (int j = 0; j < channels; ++j) { \ + int offset = (i * channels + j) * num; \ + auto* dout_ptr = dout + offset; \ + if (inv) { \ + const auto* dinx_ptr = dinx + j; \ + const auto* diny_ptr = diny + offset; \ + lite::loongarch::math::elementwise_one_to_range< \ + MergeConfig, \ + ActiveConfig>>( \ + dinx_ptr, diny_ptr, dout_ptr, num); \ + } else { \ + const auto* dinx_ptr = dinx + offset; \ + const auto* diny_ptr = diny + j; \ + lite::loongarch::math::elementwise_range_to_one< \ + MergeConfig, \ + ActiveConfig>>( \ + dinx_ptr, diny_ptr, dout_ptr, num); \ + } \ + } \ + } \ + } else if (act_type == "relu") { \ + for (int i = 0; i < batch; ++i) { \ + for (int j = 0; j < channels; ++j) { \ + int offset = (i * channels + j) * num; \ + auto* dout_ptr = dout + offset; \ + 
if (inv) { \ + const auto* dinx_ptr = dinx + j; \ + const auto* diny_ptr = diny + offset; \ + lite::loongarch::math::elementwise_one_to_range< \ + MergeConfig, \ + ActiveConfig>>( \ + dinx_ptr, diny_ptr, dout_ptr, num); \ + } else { \ + const auto* dinx_ptr = dinx + offset; \ + const auto* diny_ptr = diny + j; \ + lite::loongarch::math::elementwise_range_to_one< \ + MergeConfig, \ + ActiveConfig>>( \ + dinx_ptr, diny_ptr, dout_ptr, num); \ + } \ + } \ + } \ + } else if (act_type == "sigmoid") { \ + for (int i = 0; i < batch; ++i) { \ + for (int j = 0; j < channels; ++j) { \ + int offset = (i * channels + j) * num; \ + auto* dout_ptr = dout + offset; \ + if (inv) { \ + const auto* dinx_ptr = dinx + j; \ + const auto* diny_ptr = diny + offset; \ + lite::loongarch::math::elementwise_one_to_range< \ + MergeConfig, \ + ActiveConfig>>( \ + dinx_ptr, diny_ptr, dout_ptr, num); \ + } else { \ + const auto* dinx_ptr = dinx + offset; \ + const auto* diny_ptr = diny + j; \ + lite::loongarch::math::elementwise_range_to_one< \ + MergeConfig, \ + ActiveConfig>>( \ + dinx_ptr, diny_ptr, dout_ptr, num); \ + } \ + } \ + } \ + } else { \ + for (int i = 0; i < batch; ++i) { \ + for (int j = 0; j < channels; ++j) { \ + int offset = (i * channels + j) * num; \ + auto* dout_ptr = dout + offset; \ + if (inv) { \ + const auto* dinx_ptr = dinx + j; \ + const auto* diny_ptr = diny + offset; \ + lite::loongarch::math::elementwise_one_to_range< \ + MergeConfig, \ + ActiveConfig>>( \ + dinx_ptr, diny_ptr, dout_ptr, num); \ + } else { \ + const auto* dinx_ptr = dinx + offset; \ + const auto* diny_ptr = diny + j; \ + lite::loongarch::math::elementwise_range_to_one< \ + MergeConfig, \ + ActiveConfig>>( \ + dinx_ptr, diny_ptr, dout_ptr, num); \ + } \ + } \ + } \ + } \ + } + +// clang-format off +ElementWiseFunc(Add) +ElementWiseFuncBCast(Add) +ElementWiseFunc(Sub) +ElementWiseFuncBCast(Sub) +ElementWiseFunc(Mul) +ElementWiseFuncBCast(Mul) +ElementWiseFunc(Max) +ElementWiseFuncBCast(Max) +ElementWiseFunc(Min) +ElementWiseFuncBCast(Min) +ElementWiseFunc(Div) +ElementWiseFuncBCast(Div) +ElementWiseFunc(FloorDiv) +ElementWiseFuncBCast(FloorDiv) +ElementWiseFunc(Mod) +ElementWiseFuncBCast(Mod) +ElementWiseFunc(Pow) +ElementWiseFuncBCast(Pow) +// clang-format on + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/elementwise_common_broadcast_config.h b/lite/backends/loongarch/math/elementwise_common_broadcast_config.h new file mode 100644 index 00000000000..de31df7aa3c --- /dev/null +++ b/lite/backends/loongarch/math/elementwise_common_broadcast_config.h @@ -0,0 +1,964 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
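+
+// Editorial note (illustrative, not part of the original patch): this header
+// provides the per-type Config structs that the Elementwise_* functions in
+// elementwise.h above dispatch through. Assuming the stripped template
+// parameter is the element type, a typical call would look like
+//   // add two float buffers of length n and fuse a relu on the result
+//   Elementwise_Add<float>(x, y, out, n, /*has_active=*/true, "relu");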
+ +#pragma once + +#include +#include "lite/backends/loongarch/math/activation_functions.h" +#include "lite/backends/loongarch/xxl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +//****************************** isa op ******************************* +template +OUT_T loadu_ps_inline(const IN_T* a); + +template +void storeu_ps_inline(IN_T* b, OUT_T a); + +template +OUT_T set1_ps_inline(IN_T a); + +template +T add_ps_inline(T a, T b); + +template +T sub_ps_inline(T a, T b); + +template +T max_ps_inline(T a, T b); + +template +T min_ps_inline(T a, T b); + +template +T div_ps_inline(T a, T b); + +template +T mul_ps_inline(T a, T b); + +template +OUT_T set1_epi32_inline(IN_T a); + +template +OUT_T set1_epi64x_inline(IN_T a); + +template +OUT_T loadu_si_inline(const IN_T* a); + +template +void storeu_si_inline(IN_T* b, OUT_T a); + +template +T add_epi32_inline(T a, T b); + +template +T add_epi64_inline(T a, T b); + +template +T sub_epi32_inline(T a, T b); + +template +T sub_epi64_inline(T a, T b); + +template +T mul_epi32_inline(T a, T b); + +template +T max_epi32_inline(T a, T b); + +template +T min_epi32_inline(T a, T b); + +// compiler can't recognize intrinsics function name +#if defined(__loongarch_asx) +template <> +__m256 loadu_ps_inline<__m256, float>(const float* a) { + return lasx_loadu_f32(a); +} +template <> +void storeu_ps_inline<__m256, float>(float* b, __m256 a) { + lasx_storeu_f32(b, a); +} +template <> +__m256 set1_ps_inline<__m256, float>(float a) { + return lasx_set1_f32(a); +} +template <> +__m256 add_ps_inline<__m256>(__m256 a, __m256 b) { + return lasx_add_f32(a, b); +} +template <> +__m256 sub_ps_inline<__m256>(__m256 a, __m256 b) { + return lasx_sub_f32(a, b); +} +template <> +__m256 max_ps_inline<__m256>(__m256 a, __m256 b) { + return lasx_max_f32(a, b); +} +template <> +__m256 min_ps_inline<__m256>(__m256 a, __m256 b) { + return lasx_min_f32(a, b); +} +template <> +__m256 div_ps_inline<__m256>(__m256 a, __m256 b) { + return lasx_div_f32(a, b); +} +template <> +__m256 mul_ps_inline<__m256>(__m256 a, __m256 b) { + return lasx_mul_f32(a, b); +} +#elif defined(__loongarch_sx) +template <> +__m128 loadu_ps_inline<__m128, float>(const float* a) { + return lsx_loadu_f32(a); +} +template <> +void storeu_ps_inline<__m128, float>(float* b, __m128 a) { + lsx_storeu_f32(b, a); +} +template <> +__m128 set1_ps_inline<__m128, float>(float a) { + return lsx_set1_f32(a); +} +template <> +__m128 add_ps_inline<__m128>(__m128 a, __m128 b) { + return lsx_add_f32(a, b); +} +template <> +__m128 sub_ps_inline<__m128>(__m128 a, __m128 b) { + return lsx_sub_f32(a, b); +} +template <> +__m128 max_ps_inline<__m128>(__m128 a, __m128 b) { + return lsx_max_f32(a, b); +} +template <> +__m128 min_ps_inline<__m128>(__m128 a, __m128 b) { + return lsx_min_f32(a, b); +} +template <> +__m128 div_ps_inline<__m128>(__m128 a, __m128 b) { + return lsx_div_f32(a, b); +} +template <> +__m128 mul_ps_inline<__m128>(__m128 a, __m128 b) { + return lsx_mul_f32(a, b); +} + +__m128 lsx_relu_ps(const __m128 a) { + __m128 vec_zero = lsx_set1_f32(0.f); + return lsx_max_f32(a, vec_zero); +} +#endif + +#if defined(__loongarch_asx) +template <> +__m256i loadu_si_inline<__m256i, __m256i>(const __m256i* a) { + return lasx_loadu_m256i(a); +} +template <> +void storeu_si_inline<__m256i, __m256i>(__m256i* b, __m256i a) { + lasx_storeu_m256i(b, a); +} +template <> +__m256i set1_epi32_inline<__m256i, int>(int a) { + return lasx_set1_i32(a); +} +template <> +__m256i set1_epi64x_inline<__m256i, 
int64_t>(int64_t a) { + return lasx_set1_i64x(a); +} +template <> +__m256i add_epi32_inline<__m256i>(__m256i a, __m256i b) { + return lasx_add_i32(a, b); +} +template <> +__m256i add_epi64_inline<__m256i>(__m256i a, __m256i b) { + return lasx_add_i64(a, b); +} +template <> +__m256i sub_epi32_inline<__m256i>(__m256i a, __m256i b) { + return lasx_sub_i32(a, b); +} +template <> +__m256i sub_epi64_inline<__m256i>(__m256i a, __m256i b) { + return lasx_sub_i64(a, b); +} +template <> +__m256i mul_epi32_inline<__m256i>(__m256i a, __m256i b) { + return lasx_mullo_i32(a, b); +} +template <> +__m256i max_epi32_inline<__m256i>(__m256i a, __m256i b) { + return lasx_max_i32(a, b); +} +template <> +__m256i min_epi32_inline<__m256i>(__m256i a, __m256i b) { + return lasx_min_i32(a, b); +} +#elif defined(__loongarch_sx) +template <> +__m128i loadu_si_inline<__m128i, __m128i>(const __m128i* a) { + return lsx_loadu_m128i(a); +} +template <> +void storeu_si_inline<__m128i, __m128i>(__m128i* b, __m128i a) { + lsx_storeu_m128i(b, a); +} +template <> +__m128i set1_epi32_inline<__m128i, int>(int a) { + return lsx_set1_i32(a); +} +template <> +__m128i set1_epi64x_inline<__m128i, int64_t>(int64_t a) { + return lsx_set1_i64x(a); +} +template <> +__m128i add_epi32_inline<__m128i>(__m128i a, __m128i b) { + return lsx_add_i32(a, b); +} +template <> +__m128i add_epi64_inline<__m128i>(__m128i a, __m128i b) { + return lsx_add_i64(a, b); +} +template <> +__m128i sub_epi32_inline<__m128i>(__m128i a, __m128i b) { + return lsx_sub_i32(a, b); +} +template <> +__m128i sub_epi64_inline<__m128i>(__m128i a, __m128i b) { + return lsx_sub_i64(a, b); +} +template <> +__m128i mul_epi32_inline<__m128i>(__m128i a, __m128i b) { + return lsx_mullo_i32(a, b); +} +template <> +__m128i max_epi32_inline<__m128i>(__m128i a, __m128i b) { + return lsx_max_i32(a, b); +} +template <> +__m128i min_epi32_inline<__m128i>(__m128i a, __m128i b) { + return lsx_min_i32(a, b); +} +#endif + +//****************************** naive op ******************************* +template +inline T NaiveRelu(T a) { + return a > 0 ? a : 0; +} + +template +inline T NaiveTanh(T a) { + float x = expf(a); + float y = expf(-a); + return (x - y) / (x + y); +} + +template +inline T NaiveSigmoid(T a) { + const T min = -40.0; // SIGMOID_THRESHOLD_MIN; + const T max = 13.0; // SIGMOID_THRESHOLD_MAX; + T tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + std::exp(-tmp)); +} + +template +inline T NaiveAdd(T l, T r) { + return l + r; +} + +template +inline T NaiveSub(T l, T r) { + return l - r; +} + +template +inline T NaiveMul(T l, T r) { + return l * r; +} + +template +inline T NaiveDiv(T l, T r) { + return l / r; +} + +template +inline T NaiveFloorDiv(T l, T r) { + return static_cast(std::trunc(l / r)); +} + +template +inline T NaiveMax(T l, T r) { + return l > r ? l : r; +} + +template +inline T NaiveMin(T l, T r) { + return l < r ? 
l : r; +} + +template +inline T NaiveMod(T l, T r) { + T res = l % r; + if ((res != 0) && ((res < 0) != (r < 0))) res += r; + return res; +} + +template +inline T NaivePow(T l, T r) { + return std::pow(l, r); +} + +//*************************** Config Struct ***************************** +struct NullCpuInstruction {}; + +template +struct MergeConfig : public ComputeConfig, public ActConfig {}; + +enum class ActiveType { NO_ACTIVE, RELU, TANH, SIGMOID }; + +template +struct BasicConfig {}; + +template +struct ActiveConfig {}; + +template +struct AddConfig {}; + +template +struct SubConfig {}; + +template +struct MulConfig {}; + +template +struct MaxConfig {}; + +template +struct MinConfig {}; + +template +struct DivConfig {}; + +template +struct FloorDivConfig {}; + +template +struct ModConfig {}; + +template +struct PowConfig {}; + +//***************************** float Config ********************* +template <> +struct BasicConfig { +#if defined(__loongarch_asx) + using T = float; + using ISA_T = __m256; + using LD_T = float; // using for load and store +#elif defined(__loongarch_sx) + using T = float; + using ISA_T = __m128; + using LD_T = float; +#endif + constexpr static typename BasicConfig::ISA_T (*isa_dup)( + typename BasicConfig::T){set1_ps_inline}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_ld)( + const typename BasicConfig::LD_T*){ + loadu_ps_inline}; // NOLINT + constexpr static void (*isa_str)(typename BasicConfig::LD_T*, + typename BasicConfig::ISA_T){ + storeu_ps_inline}; // NOLINT +}; + +template <> +struct AddConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveAdd}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, typename BasicConfig::ISA_T){ + add_ps_inline::ISA_T>}; // NOLINT +}; + +template <> +struct SubConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveSub}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, typename BasicConfig::ISA_T){ + sub_ps_inline::ISA_T>}; // NOLINT +}; + +template <> +struct MulConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMul}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, typename BasicConfig::ISA_T){ + mul_ps_inline::ISA_T>}; // NOLINT +}; + +template <> +struct MaxConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMax}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, typename BasicConfig::ISA_T){ + max_ps_inline::ISA_T>}; // NOLINT +}; + +template <> +struct MinConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMin}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, typename BasicConfig::ISA_T){ + min_ps_inline::ISA_T>}; // NOLINT +}; + +template <> +struct DivConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveDiv}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, 
typename BasicConfig::ISA_T){ + div_ps_inline::ISA_T>}; // NOLINT +}; + +//************************** in32,int64 ****************** +template <> +struct BasicConfig { +#if defined(__loongarch_asx) + using T = int32_t; + using ISA_T = __m256i; + using LD_T = __m256i; +#elif defined(__loongarch_sx) + using T = int32_t; + using ISA_T = __m128i; + using LD_T = __m128i; +#endif + constexpr static typename BasicConfig::ISA_T (*isa_dup)( + typename BasicConfig::T){set1_epi32_inline}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_ld)( + const typename BasicConfig::LD_T*){ + loadu_si_inline}; // NOLINT + constexpr static void (*isa_str)(typename BasicConfig::LD_T*, + typename BasicConfig::ISA_T){ + storeu_si_inline}; // NOLINT +}; + +template <> +struct BasicConfig { +#if defined(__loongarch_asx) + using T = int64_t; + using ISA_T = __m256i; + using LD_T = __m256i; +#elif defined(__loongarch_sx) + using T = int64_t; + using ISA_T = __m128i; + using LD_T = __m128i; +#endif + constexpr static typename BasicConfig::ISA_T (*isa_dup)( + typename BasicConfig::T){ + set1_epi64x_inline}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_ld)( + const typename BasicConfig::LD_T*){ + loadu_si_inline}; // NOLINT + constexpr static void (*isa_str)(typename BasicConfig::LD_T*, + typename BasicConfig::ISA_T){ + storeu_si_inline}; // NOLINT +}; + +template <> +struct AddConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveAdd}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){ + add_epi32_inline::ISA_T>}; // NOLINT +}; + +template <> +struct AddConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveAdd}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){ + add_epi64_inline::ISA_T>}; // NOLINT +}; + +template <> +struct SubConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveSub}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){ + sub_epi32_inline::ISA_T>}; // NOLINT +}; + +template <> +struct SubConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveSub}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){ + sub_epi64_inline::ISA_T>}; // NOLINT +}; + +template <> +struct MulConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMul}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){ + mul_epi32_inline::ISA_T>}; // NOLINT +}; + +template <> +struct MulConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMul}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +template <> +struct MaxConfig : public BasicConfig { + constexpr static typename 
BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMax}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){ + max_epi32_inline::ISA_T>}; // NOLINT +}; + +template <> +struct MaxConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMax}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +template <> +struct MinConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMin}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){ + min_epi32_inline::ISA_T>}; // NOLINT +}; + +template <> +struct MinConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMin}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +// mod has no isa version and float version +template <> +struct ModConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMod}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +template <> +struct ModConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveMod}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +// div except float has no isa version +template <> +struct DivConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveDiv}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +template <> +struct DivConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveDiv}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +// floordiv has no isa version +template <> +struct FloorDivConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveFloorDiv}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +template <> +struct FloorDivConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaiveFloorDiv}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +template <> +struct FloorDivConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + 
typename BasicConfig::T){NaiveFloorDiv}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +// pow has no isa version +template <> +struct PowConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaivePow}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +template <> +struct PowConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaivePow}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +template <> +struct PowConfig : public BasicConfig { + constexpr static typename BasicConfig::T (*naive_op)( + typename BasicConfig::T, + typename BasicConfig::T){NaivePow}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_op)( + typename BasicConfig::ISA_T, + typename BasicConfig::ISA_T){nullptr}; // NOLINT +}; + +// Active only support float version +template +struct ActiveConfig { + constexpr static DataType (*naive_active)(DataType){nullptr}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_active)( + const typename BasicConfig::ISA_T){nullptr}; // NOLINT + constexpr static bool has_active{false}; // NOLINT +}; + +#if defined(__loongarch_asx) +namespace forward_lasx = paddle::lite::loongarch::math::detail::forward::lasx; + +template <> +struct ActiveConfig { + constexpr static float (*naive_active)(float){NaiveRelu}; // NOLINT + constexpr static __m256 (*isa_active)(const __m256){ + forward_lasx::Relu}; // NOLINT + constexpr static bool has_active{true}; // NOLINT +}; + +template <> +struct ActiveConfig { + constexpr static float (*naive_active)(float){NaiveTanh}; // NOLINT + constexpr static __m256 (*isa_active)(const __m256){ + forward_lasx::Tanh}; // NOLINT + constexpr static bool has_active{true}; // NOLINT +}; + +template <> +struct ActiveConfig { + constexpr static float (*naive_active)(float){NaiveSigmoid}; // NOLINT + constexpr static __m256 (*isa_active)(const __m256){ + forward_lasx::Sigmoid}; // NOLINT + constexpr static bool has_active{true}; // NOLINT +}; +#elif defined(__loongarch_sx) +extern __m128 lsx_relu_ps(const __m128 a); + +template <> +struct ActiveConfig { + constexpr static float (*naive_active)(float){NaiveRelu}; // NOLINT + constexpr static __m128 (*isa_active)(const __m128){lsx_relu_ps}; // NOLINT + constexpr static bool has_active{true}; // NOLINT +}; + +// LSX has no tanh and sigmoid for now +template <> +struct ActiveConfig { + constexpr static float (*naive_active)(float){NaiveTanh}; // NOLINT + constexpr static __m128 (*isa_active)(const __m128){nullptr}; // NOLINT + constexpr static bool has_active{true}; // NOLINT +}; + +template <> +struct ActiveConfig { + constexpr static float (*naive_active)(float){NaiveSigmoid}; // NOLINT + constexpr static __m128 (*isa_active)(const __m128){nullptr}; // NOLINT + constexpr static bool has_active{true}; // NOLINT +}; +#endif + +// fuse-activation doesn't support int32 and int64 type +template <> +struct ActiveConfig { + constexpr static int32_t (*naive_active)(int32_t){nullptr}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_active)( + const typename BasicConfig::ISA_T){nullptr}; // NOLINT + constexpr 
static bool has_active{false}; // NOLINT +}; + +template <> +struct ActiveConfig { + constexpr static int32_t (*naive_active)(int32_t){nullptr}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_active)( + const typename BasicConfig::ISA_T){nullptr}; // NOLINT + constexpr static bool has_active{false}; // NOLINT +}; + +template <> +struct ActiveConfig { + constexpr static int32_t (*naive_active)(int32_t){nullptr}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_active)( + const typename BasicConfig::ISA_T){nullptr}; // NOLINT + constexpr static bool has_active{false}; // NOLINT +}; + +template <> +struct ActiveConfig { + constexpr static int64_t (*naive_active)(int64_t){nullptr}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_active)( + const typename BasicConfig::ISA_T){nullptr}; // NOLINT + constexpr static bool has_active{false}; // NOLINT +}; + +template <> +struct ActiveConfig { + constexpr static int64_t (*naive_active)(int64_t){nullptr}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_active)( + const typename BasicConfig::ISA_T){nullptr}; // NOLINT + constexpr static bool has_active{false}; // NOLINT +}; + +template <> +struct ActiveConfig { + constexpr static int64_t (*naive_active)(int64_t){nullptr}; // NOLINT + constexpr static typename BasicConfig::ISA_T (*isa_active)( + const typename BasicConfig::ISA_T){nullptr}; // NOLINT + constexpr static bool has_active{false}; // NOLINT +}; + +// avoid compling error: xxx_address will never be null +static bool condition_one(void* isa_op, void* naive_op) { + return ((isa_op != nullptr) && (naive_op != nullptr)); +} + +static bool condition_two(void* isa_op, void* naive_op) { + return ((isa_op == nullptr) && (naive_op != nullptr)); +} + +static bool condition_three(void* isa_act) { return (isa_act != nullptr); } + +// Fuse-Activation only supports relu, sigmoid and tanh for LASX instruction, +// relu for LSX instruction, the others run naive functions instead. 
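+//
+// Editorial sketch (not part of the original patch): do_isa_elementwise below
+// selects its path from the function pointers supplied by Config, roughly:
+//   if (isa_op && naive_op)         vectorized main loop over element_num-wide
+//                                   chunks, scalar naive_op tail for the rest
+//   else if (!isa_op && naive_op)   fully scalar fallback via naive_op
+//   else                            LOG(FATAL), no usable op in the Config
+// The fused activation uses isa_active when it is non-null and falls back to
+// naive_active element by element otherwise.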
+template +void do_isa_elementwise(const typename Config::T* dinx, + const typename Config::T* diny, + typename Config::T* dout, + int num) { + static_assert((IS_X_SINGLE && IS_Y_SINGLE) != true, + "X and Y could not be both single"); + using T = typename Config::T; + using ISA_T = typename Config::ISA_T; + using LD_T = typename Config::LD_T; + constexpr auto isa_dup = Config::isa_dup; + constexpr auto isa_ld = Config::isa_ld; + constexpr auto isa_st = Config::isa_str; + constexpr auto isa_op = Config::isa_op; + constexpr auto naive_op = Config::naive_op; + constexpr auto isa_act = Config::isa_active; + constexpr auto naive_active = Config::naive_active; + constexpr auto has_active = Config::has_active; + constexpr int element_num = sizeof(ISA_T) / sizeof(T); + int cnt = num / element_num; + int remain = num % element_num; + + auto dinx_ptr = dinx; + auto diny_ptr = diny; + auto dout_ptr = dout; + + // avoid compiling error + bool condition1 = condition_one(reinterpret_cast(isa_op), + reinterpret_cast(naive_op)); + bool condition2 = condition_two(reinterpret_cast(isa_op), + reinterpret_cast(naive_op)); + bool condition3 = condition_three(reinterpret_cast(isa_act)); + + if (condition1) { + ISA_T rbx, rby; + if (IS_X_SINGLE) { + rbx = isa_dup(*dinx); + } + if (IS_Y_SINGLE) { + rby = isa_dup(*diny); + } + + for (int i = 0; i < cnt; i++) { + ISA_T dinx0, diny0, doutz0; + if (!IS_X_SINGLE) { + dinx0 = isa_ld(reinterpret_cast(dinx_ptr)); + dinx_ptr += element_num; + } + if (!IS_Y_SINGLE) { + diny0 = isa_ld(reinterpret_cast(diny_ptr)); + diny_ptr += element_num; + } + if (IS_X_SINGLE && !IS_Y_SINGLE) { + doutz0 = isa_op(rbx, diny0); + } else if (!IS_X_SINGLE && IS_Y_SINGLE) { + doutz0 = isa_op(dinx0, rby); + } else if (!IS_X_SINGLE && !IS_Y_SINGLE) { + doutz0 = isa_op(dinx0, diny0); + } + + if (has_active && condition3) { + doutz0 = isa_act(doutz0); + } else if (has_active) { + T* tmp_data = reinterpret_cast(&doutz0); + for (int ii = 0; ii < element_num; ii++) { + tmp_data[ii] = naive_active(tmp_data[ii]); + } + } + isa_st(reinterpret_cast(dout_ptr), doutz0); + dout_ptr += element_num; + } + if (remain > 0) { + for (int p = 0; p < remain; p++) { + auto tmp = naive_op(*dinx_ptr, *diny_ptr); + if (has_active) { + tmp = naive_active(tmp); + } + *dout_ptr = tmp; + dout_ptr++; + if (!IS_X_SINGLE) { + dinx_ptr++; + } + if (!IS_Y_SINGLE) { + diny_ptr++; + } + } + } + } else if (condition2) { + for (int p = 0; p < num; p++) { + auto tmp = naive_op(*dinx_ptr, *diny_ptr); + if (has_active) { + tmp = naive_active(tmp); + } + *dout_ptr = tmp; + dout_ptr++; + if (!IS_X_SINGLE) { + dinx_ptr++; + } + if (!IS_Y_SINGLE) { + diny_ptr++; + } + } + } else { + LOG(FATAL) << "do_isa_elementwise has no op function to call."; + } +} + +template +void elementwise_one_to_range(const typename Config::T* dinx, + const typename Config::T* diny, + typename Config::T* dout, + int num) { + do_isa_elementwise(dinx, diny, dout, num); +} + +template +void elementwise_range_to_one(const typename Config::T* dinx, + const typename Config::T* diny, + typename Config::T* dout, + int num) { + do_isa_elementwise(dinx, diny, dout, num); +} + +template +void elementwise_range_to_range(const typename Config::T* dinx, + const typename Config::T* diny, + typename Config::T* dout, + int num) { + do_isa_elementwise(dinx, diny, dout, num); +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/fill_bias_activate.cc 
b/lite/backends/loongarch/math/fill_bias_activate.cc new file mode 100644 index 00000000000..9aabdd2b88c --- /dev/null +++ b/lite/backends/loongarch/math/fill_bias_activate.cc @@ -0,0 +1,534 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/fill_bias_activate.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/backends/loongarch/xxl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +static void activate_relu_inplace(float *data, int len, float alpha, int mode) { + int i = 0; + + if (0 == mode) { // relu +#ifdef __loongarch_asx + __m256 vec_zero = lasx_set1_f32(0.f); + for (; i + 7 < len; i += 8) { + __m256 vec_data = lasx_loadu_f32(data + i); + lasx_storeu_f32(data + i, lasx_max_f32(vec_data, vec_zero)); + } +#endif +#ifdef __loongarch_sx + __m128 vec_zero_128 = lsx_set1_f32(0.f); + for (; i + 3 < len; i += 4) { + __m128 vec_data_128 = lsx_loadu_f32(data + i); + lsx_storeu_f32(data + i, lsx_max_f32(vec_data_128, vec_zero_128)); + } +#endif + for (; i < len; i++) { + data[i] = data[i] > 0.f ? data[i] : 0.f; + } + } else { // relu6 +#ifdef __loongarch_asx + __m256 vec_zero = lasx_set1_f32(0.f); + __m256 vec_alph = lasx_set1_f32(alpha); + for (; i + 7 < len; i += 8) { + __m256 vec_data = lasx_loadu_f32(data + i); + lasx_storeu_f32( + data + i, lasx_min_f32(lasx_max_f32(vec_data, vec_zero), vec_alph)); + } +#endif +#ifdef __loongarch_sx + __m128 vec_zero_128 = lsx_set1_f32(0.f); + __m128 vec_alph_128 = lsx_set1_f32(alpha); + for (; i + 3 < len; i += 4) { + __m128 vec_data_128 = lsx_loadu_f32(data + i); + lsx_storeu_f32( + data + i, + lsx_min_f32(lsx_max_f32(vec_data_128, vec_zero_128), vec_alph_128)); + } +#endif + for (; i < len; i++) { + data[i] = data[i] > 0.f ? data[i] : 0.f; + data[i] = data[i] < alpha ? 
data[i] : alpha; + } + } +} + +static void activate_relu_inplace_bias(float *data, + const float *bias, + int channel, + int channel_size, + float alpha, + int mode) { + int i = 0; + int j = 0; + float *tmp_data = data; + +#ifdef __loongarch_asx + __m256 vec_zero = {0.f}; + __m256 vec_bias = {0.f}; + __m256 vec_data = {0.f}; + __m256 vec_alph = lasx_set1_f32(alpha); +#endif +#ifdef __loongarch_sx + __m128 vec_zero_128 = {0.f}; + __m128 vec_bias_128 = {0.f}; + __m128 vec_data_128 = {0.f}; + __m128 vec_alph_128 = lsx_set1_f32(alpha); +#endif + + if (0 == mode) { // relu + for (j = 0; j < channel; j++) { + i = 0; + tmp_data = data + j * channel_size; +#ifdef __loongarch_asx + vec_bias = lasx_set1_f32(bias[j]); + for (; i + 7 < channel_size; i += 8) { + vec_data = lasx_loadu_f32(tmp_data + i); + vec_data = lasx_add_f32(vec_bias, vec_data); + lasx_storeu_f32(tmp_data + i, lasx_max_f32(vec_data, vec_zero)); + } +#endif +#ifdef __loongarch_sx + vec_bias_128 = lsx_set1_f32(bias[j]); + for (; i + 3 < channel_size; i += 4) { + vec_data_128 = lsx_loadu_f32(tmp_data + i); + vec_data_128 = lsx_add_f32(vec_data_128, vec_bias_128); + lsx_storeu_f32(tmp_data + i, lsx_max_f32(vec_data_128, vec_zero_128)); + } +#endif + for (; i < channel_size; i++) { + tmp_data[i] += bias[j]; + tmp_data[i] = tmp_data[i] > 0.f ? tmp_data[i] : 0.f; + } + } + } else { // relu6 + for (j = 0; j < channel; j++) { + i = 0; + tmp_data = data + j * channel_size; +#ifdef __loongarch_asx + vec_bias = lasx_set1_f32(bias[j]); + for (; i + 7 < channel_size; i += 8) { + vec_data = lasx_loadu_f32(tmp_data + i); + vec_data = lasx_add_f32(vec_bias, vec_data); + lasx_storeu_f32( + tmp_data + i, + lasx_min_f32(lasx_max_f32(vec_data, vec_zero), vec_alph)); + } +#endif +#ifdef __loongarch_sx + vec_bias_128 = lsx_set1_f32(bias[j]); + for (; i + 3 < channel_size; i += 4) { + vec_data_128 = lsx_loadu_f32(tmp_data + i); + vec_data_128 = lsx_add_f32(vec_data_128, vec_bias_128); + lsx_storeu_f32( + tmp_data + i, + lsx_min_f32(lsx_max_f32(vec_data_128, vec_zero_128), vec_alph_128)); + } +#endif + for (; i < channel_size; i++) { + tmp_data[i] += bias[j]; + tmp_data[i] = tmp_data[i] > 0.f ? tmp_data[i] : 0.f; + tmp_data[i] = tmp_data[i] < alpha ? tmp_data[i] : alpha; + } + } + } +} + +static void activate_lrelu_inplace(float *data, int len, float alpha) { + const int cmp_le_os = 2; + int i = 0; + +#ifdef __loongarch_asx + __m256 vec_zero = lasx_set1_f32(0.f); + __m256 vec_alph = lasx_set1_f32(alpha); + for (; i + 7 < len; i += 8) { + __m256 vec_data = lasx_loadu_f32(data + i); + __m256 vec_lr = lasx_mul_f32(vec_alph, vec_data); + __m256 vec_mask = lasx_cmp_f32(vec_data, vec_zero, cmp_le_os); + lasx_storeu_f32(data + i, lasx_blendv_f32(vec_data, vec_lr, vec_mask)); + } +#endif +#ifdef __loongarch_sx + __m128 vec_zero_128 = lsx_set1_f32(0.f); + __m128 vec_alph_128 = lsx_set1_f32(alpha); + for (; i + 3 < len; i += 4) { + __m128 vec_data_128 = lsx_loadu_f32(data + i); + __m128 vec_lr_128 = lsx_mul_f32(vec_data_128, vec_alph_128); + __m128 vec_mask_128 = lsx_cmple_f32(vec_data_128, vec_zero_128); + lsx_storeu_f32(data + i, + lsx_blendv_f32(vec_data_128, vec_lr_128, vec_mask_128)); + } +#endif + for (; i < len; i++) { + data[i] = data[i] > 0.f ? 
data[i] : alpha * data[i]; + } +} + +static void activate_lrelu_inplace_bias(float *data, + const float *bias, + int channel, + int channel_size, + float alpha) { + const int cmp_le_os = 2; + int i = 0; + int j = 0; + float *tmp_data = data; + +#ifdef __loongarch_asx + __m256 vec_zero = lasx_set1_f32(0.f); + __m256 vec_alph = lasx_set1_f32(alpha); + __m256 vec_bias = {0.f}; +#endif +#ifdef __loongarch_sx + __m128 vec_zero_128 = lsx_set1_f32(0.f); + __m128 vec_alph_128 = lsx_set1_f32(alpha); + __m128 vec_bias_128 = {0.f}; +#endif + + for (j = 0; j < channel; j++) { + i = 0; + tmp_data = data + j * channel_size; + +#ifdef __loongarch_asx + vec_bias = lasx_set1_f32(bias[j]); + for (; i + 7 < channel_size; i += 8) { + __m256 vec_data = lasx_add_f32(vec_bias, lasx_loadu_f32(tmp_data + i)); + __m256 vec_lr = lasx_mul_f32(vec_alph, vec_data); + __m256 vec_mask = lasx_cmp_f32(vec_data, vec_zero, cmp_le_os); + lasx_storeu_f32(tmp_data + i, + lasx_blendv_f32(vec_data, vec_lr, vec_mask)); + } +#endif +#ifdef __loongarch_sx + vec_bias_128 = lsx_set1_f32(bias[j]); + for (; i + 3 < channel_size; i += 4) { + __m128 vec_data_128 = + lsx_add_f32(vec_bias_128, lsx_loadu_f32(tmp_data + i)); + __m128 vec_lr_128 = lsx_mul_f32(vec_data_128, vec_alph_128); + __m128 vec_mask_128 = lsx_cmple_f32(vec_data_128, vec_zero_128); + lsx_storeu_f32(tmp_data + i, + lsx_blendv_f32(vec_data_128, vec_lr_128, vec_mask_128)); + } +#endif + for (; i < channel_size; i++) { + tmp_data[i] += bias[j]; + tmp_data[i] = tmp_data[i] > 0.f ? tmp_data[i] : alpha * tmp_data[i]; + } + } +} + +static void activate_hardswish_inplace_bias(float *data, + const float *bias, + int channel, + int channel_size, + float scale, + float threshold, + float offset) { +#ifdef __loongarch_asx + int cnt = channel_size >> 5; + int remain = channel_size & 31; + __m256 vec_zero = lasx_set1_f32(0.f); + __m256 vec_scale = lasx_set1_f32(1.0 / scale); + __m256 vec_threshold = lasx_set1_f32(threshold); + __m256 vec_offset = lasx_set1_f32(offset); +#else + int cnt = channel_size >> 4; + int remain = channel_size & 15; +#endif + __m128 vec_zero_128 = lsx_set1_f32(0.f); + __m128 vec_scale_128 = lsx_set1_f32(1.0 / scale); + __m128 vec_threshold_128 = lsx_set1_f32(threshold); + __m128 vec_offset_128 = lsx_set1_f32(offset); + int cnt_4 = remain >> 2; + int rem_4 = remain & 3; + for (int i = 0; i < channel; i++) { +#ifdef __loongarch_asx + __m256 vec_bias = lasx_set1_f32(bias[i]); +#endif + __m128 vec_bias_128 = lsx_set1_f32(bias[i]); + float *tmp_data = data + i * channel_size; + + for (int j = 0; j < cnt; j++) { +#ifdef __loongarch_asx + __m256 vin0 = lasx_add_f32(lasx_loadu_f32(tmp_data), vec_bias); + __m256 vin1 = lasx_add_f32(lasx_loadu_f32(tmp_data + 8), vec_bias); + __m256 vin2 = lasx_add_f32(lasx_loadu_f32(tmp_data + 16), vec_bias); + __m256 vin3 = lasx_add_f32(lasx_loadu_f32(tmp_data + 24), vec_bias); + __m256 vadd0 = lasx_add_f32(vin0, vec_offset); + __m256 vadd1 = lasx_add_f32(vin1, vec_offset); + __m256 vadd2 = lasx_add_f32(vin2, vec_offset); + __m256 vadd3 = lasx_add_f32(vin3, vec_offset); + __m256 vsum0 = lasx_mul_f32(vin0, vec_scale); + __m256 vsum1 = lasx_mul_f32(vin1, vec_scale); + __m256 vsum2 = lasx_mul_f32(vin2, vec_scale); + __m256 vsum3 = lasx_mul_f32(vin3, vec_scale); + __m256 vres0 = + lasx_min_f32(lasx_max_f32(vadd0, vec_zero), vec_threshold); + __m256 vres1 = + lasx_min_f32(lasx_max_f32(vadd1, vec_zero), vec_threshold); + __m256 vres2 = + lasx_min_f32(lasx_max_f32(vadd2, vec_zero), vec_threshold); + __m256 vres3 = + 
lasx_min_f32(lasx_max_f32(vadd3, vec_zero), vec_threshold); + lasx_storeu_f32(tmp_data, lasx_mul_f32(vres0, vsum0)); + lasx_storeu_f32(tmp_data + 8, lasx_mul_f32(vres1, vsum1)); + lasx_storeu_f32(tmp_data + 16, lasx_mul_f32(vres2, vsum2)); + lasx_storeu_f32(tmp_data + 24, lasx_mul_f32(vres3, vsum3)); + tmp_data += 32; +#else + __m128 vin0 = lsx_add_f32(lsx_loadu_f32(tmp_data), vec_bias_128); + __m128 vin1 = lsx_add_f32(lsx_loadu_f32(tmp_data + 4), vec_bias_128); + __m128 vin2 = lsx_add_f32(lsx_loadu_f32(tmp_data + 8), vec_bias_128); + __m128 vin3 = lsx_add_f32(lsx_loadu_f32(tmp_data + 12), vec_bias_128); + __m128 vadd0 = lsx_add_f32(vin0, vec_offset_128); + __m128 vadd1 = lsx_add_f32(vin1, vec_offset_128); + __m128 vadd2 = lsx_add_f32(vin2, vec_offset_128); + __m128 vadd3 = lsx_add_f32(vin3, vec_offset_128); + __m128 vsum0 = lsx_mul_f32(vin0, vec_scale_128); + __m128 vsum1 = lsx_mul_f32(vin1, vec_scale_128); + __m128 vsum2 = lsx_mul_f32(vin2, vec_scale_128); + __m128 vsum3 = lsx_mul_f32(vin3, vec_scale_128); + __m128 vres0 = + lsx_min_f32(lsx_max_f32(vadd0, vec_zero_128), vec_threshold_128); + __m128 vres1 = + lsx_min_f32(lsx_max_f32(vadd1, vec_zero_128), vec_threshold_128); + __m128 vres2 = + lsx_min_f32(lsx_max_f32(vadd2, vec_zero_128), vec_threshold_128); + __m128 vres3 = + lsx_min_f32(lsx_max_f32(vadd3, vec_zero_128), vec_threshold_128); + lsx_storeu_f32(tmp_data, lsx_mul_f32(vres0, vsum0)); + lsx_storeu_f32(tmp_data + 4, lsx_mul_f32(vres1, vsum1)); + lsx_storeu_f32(tmp_data + 8, lsx_mul_f32(vres2, vsum2)); + lsx_storeu_f32(tmp_data + 12, lsx_mul_f32(vres3, vsum3)); + tmp_data += 16; +#endif + } + for (int j = 0; j < cnt_4; j++) { + __m128 vin0 = lsx_add_f32(lsx_loadu_f32(tmp_data), vec_bias_128); + __m128 vadd0 = lsx_add_f32(vin0, vec_offset_128); + __m128 vsum0 = lsx_mul_f32(vin0, vec_scale_128); + __m128 vres0 = + lsx_min_f32(lsx_max_f32(vadd0, vec_zero_128), vec_threshold_128); + lsx_storeu_f32(tmp_data, lsx_mul_f32(vres0, vsum0)); + tmp_data += 4; + } + for (int j = 0; j < rem_4; j++) { + tmp_data[0] = tmp_data[0] + bias[i]; + tmp_data[0] = std::min(std::max(0.f, tmp_data[0] + offset), threshold) * + tmp_data[0] / scale; + tmp_data++; + } + } +} + +static void activate_hardswish_inplace( + float *data, int len, float scale, float threshold, float offset) { +#ifdef __loongarch_asx + int cnt = len >> 5; + int remain = len & 31; + __m256 vec_zero = lasx_set1_f32(0.f); + __m256 vec_scale = lasx_set1_f32(1.0 / scale); + __m256 vec_threshold = lasx_set1_f32(threshold); + __m256 vec_offset = lasx_set1_f32(offset); +#else + int cnt = len >> 4; + int remain = len & 15; +#endif + __m128 vec_zero_128 = lsx_set1_f32(0.f); + __m128 vec_scale_128 = lsx_set1_f32(1.0 / scale); + __m128 vec_threshold_128 = lsx_set1_f32(threshold); + __m128 vec_offset_128 = lsx_set1_f32(offset); + int cnt_4 = remain >> 2; + int rem_4 = remain & 3; + float *tmp_data = data; + for (int i = 0; i < cnt; i++) { +#ifdef __loongarch_asx + __m256 vin0 = lasx_loadu_f32(tmp_data); + __m256 vin1 = lasx_loadu_f32(tmp_data + 8); + __m256 vin2 = lasx_loadu_f32(tmp_data + 16); + __m256 vin3 = lasx_loadu_f32(tmp_data + 24); + __m256 vadd0 = lasx_add_f32(vin0, vec_offset); + __m256 vadd1 = lasx_add_f32(vin1, vec_offset); + __m256 vadd2 = lasx_add_f32(vin2, vec_offset); + __m256 vadd3 = lasx_add_f32(vin3, vec_offset); + __m256 vsum0 = lasx_mul_f32(vin0, vec_scale); + __m256 vsum1 = lasx_mul_f32(vin1, vec_scale); + __m256 vsum2 = lasx_mul_f32(vin2, vec_scale); + __m256 vsum3 = lasx_mul_f32(vin3, vec_scale); + __m256 vres0 = 
lasx_min_f32(lasx_max_f32(vadd0, vec_zero), vec_threshold); + __m256 vres1 = lasx_min_f32(lasx_max_f32(vadd1, vec_zero), vec_threshold); + __m256 vres2 = lasx_min_f32(lasx_max_f32(vadd2, vec_zero), vec_threshold); + __m256 vres3 = lasx_min_f32(lasx_max_f32(vadd3, vec_zero), vec_threshold); + lasx_storeu_f32(tmp_data, lasx_mul_f32(vres0, vsum0)); + lasx_storeu_f32(tmp_data + 8, lasx_mul_f32(vres1, vsum1)); + lasx_storeu_f32(tmp_data + 16, lasx_mul_f32(vres2, vsum2)); + lasx_storeu_f32(tmp_data + 24, lasx_mul_f32(vres3, vsum3)); + tmp_data += 32; +#else + __m128 vin0 = lsx_loadu_f32(tmp_data); + __m128 vin1 = lsx_loadu_f32(tmp_data + 4); + __m128 vin2 = lsx_loadu_f32(tmp_data + 8); + __m128 vin3 = lsx_loadu_f32(tmp_data + 12); + __m128 vadd0 = lsx_add_f32(vin0, vec_offset_128); + __m128 vadd1 = lsx_add_f32(vin1, vec_offset_128); + __m128 vadd2 = lsx_add_f32(vin2, vec_offset_128); + __m128 vadd3 = lsx_add_f32(vin3, vec_offset_128); + __m128 vsum0 = lsx_mul_f32(vin0, vec_scale_128); + __m128 vsum1 = lsx_mul_f32(vin1, vec_scale_128); + __m128 vsum2 = lsx_mul_f32(vin2, vec_scale_128); + __m128 vsum3 = lsx_mul_f32(vin3, vec_scale_128); + __m128 vres0 = + lsx_min_f32(lsx_max_f32(vadd0, vec_zero_128), vec_threshold_128); + __m128 vres1 = + lsx_min_f32(lsx_max_f32(vadd1, vec_zero_128), vec_threshold_128); + __m128 vres2 = + lsx_min_f32(lsx_max_f32(vadd2, vec_zero_128), vec_threshold_128); + __m128 vres3 = + lsx_min_f32(lsx_max_f32(vadd3, vec_zero_128), vec_threshold_128); + lsx_storeu_f32(tmp_data, lsx_mul_f32(vres0, vsum0)); + lsx_storeu_f32(tmp_data + 4, lsx_mul_f32(vres1, vsum1)); + lsx_storeu_f32(tmp_data + 8, lsx_mul_f32(vres2, vsum2)); + lsx_storeu_f32(tmp_data + 12, lsx_mul_f32(vres3, vsum3)); + tmp_data += 16; +#endif + } + for (int i = 0; i < cnt_4; i++) { + __m128 vin0 = lsx_loadu_f32(tmp_data); + __m128 vadd0 = lsx_add_f32(vin0, vec_offset_128); + __m128 vsum0 = lsx_mul_f32(vin0, vec_scale_128); + __m128 vres0 = + lsx_min_f32(lsx_max_f32(vadd0, vec_zero_128), vec_threshold_128); + lsx_storeu_f32(tmp_data, lsx_mul_f32(vres0, vsum0)); + tmp_data += 4; + } + for (int i = 0; i < rem_4; i++) { + tmp_data[0] = std::min(std::max(0.f, tmp_data[0] + offset), threshold) * + tmp_data[0] / scale; + tmp_data++; + } +} + +static void activate_none_inplace_bias(float *data, + const float *bias, + int channel, + int channel_size) { + int i = 0; + int j = 0; + float *tmp_data = data; + +#ifdef __loongarch_asx + __m256 vec_bias = {0.f}; + __m256 vec_data = {0.f}; +#endif +#ifdef __loongarch_sx + __m128 vec_bias_128 = {0.f}; + __m128 vec_data_128 = {0.f}; +#endif + + for (j = 0; j < channel; j++) { + i = 0; + tmp_data = data + j * channel_size; +#ifdef __loongarch_asx + vec_bias = lasx_set1_f32(bias[j]); + for (; i + 7 < channel_size; i += 8) { + vec_data = lasx_loadu_f32(tmp_data + i); + vec_data = lasx_add_f32(vec_bias, vec_data); + lasx_storeu_f32(tmp_data + i, vec_data); + } +#endif +#ifdef __loongarch_sx + vec_bias_128 = lsx_set1_f32(bias[j]); + for (; i + 3 < channel_size; i += 4) { + vec_data_128 = lsx_loadu_f32(tmp_data + i); + vec_data_128 = lsx_add_f32(vec_data_128, vec_bias_128); + lsx_storeu_f32(tmp_data + i, vec_data_128); + } +#endif + for (; i < channel_size; i++) { + tmp_data[i] += bias[j]; + } + } +} + +void fill_bias_act(float *tensor, + const float *bias, + int channel, + int channel_size, + bool flag_bias, + const operators::ActivationParam *act_param) { + auto act_type = act_param->active_type; + float local_alpha = 0.f; + int len = channel * channel_size; + + if ((act_param != nullptr) 
&& (act_param->has_active)) { + if ((flag_bias) && (bias != nullptr)) { + // activate and bias + if (act_type == lite_api::ActivationType::kRelu) { + activate_relu_inplace_bias( + tensor, bias, channel, channel_size, local_alpha, 0); + } else if (act_type == lite_api::ActivationType::kRelu6) { + local_alpha = act_param->Relu_clipped_coef; + activate_relu_inplace_bias( + tensor, bias, channel, channel_size, local_alpha, 1); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + local_alpha = act_param->Leaky_relu_alpha; + activate_lrelu_inplace_bias( + tensor, bias, channel, channel_size, local_alpha); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + local_alpha = act_param->hard_swish_scale; + activate_hardswish_inplace_bias(tensor, + bias, + channel, + channel_size, + local_alpha, + act_param->hard_swish_threshold, + act_param->hard_swish_offset); + } + } else { + // activate + if (act_type == lite_api::ActivationType::kRelu) { + activate_relu_inplace(tensor, len, local_alpha, 0); + } else if (act_type == lite_api::ActivationType::kRelu6) { + local_alpha = act_param->Relu_clipped_coef; + activate_relu_inplace(tensor, len, local_alpha, 1); + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + local_alpha = act_param->Leaky_relu_alpha; + activate_lrelu_inplace(tensor, len, local_alpha); + } else if (act_type == lite_api::ActivationType::kHardSwish) { + local_alpha = act_param->hard_swish_scale; + activate_hardswish_inplace(tensor, + len, + local_alpha, + act_param->hard_swish_threshold, + act_param->hard_swish_offset); + } + } + } else { + // only add bias + if ((flag_bias) && (bias != nullptr)) + activate_none_inplace_bias(tensor, bias, channel, channel_size); + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/fill_bias_activate.h b/lite/backends/loongarch/math/fill_bias_activate.h new file mode 100644 index 00000000000..fee342740a2 --- /dev/null +++ b/lite/backends/loongarch/math/fill_bias_activate.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void fill_bias_act(float* tensor, + const float* bias, + int channel, + int channel_size, + bool flag_bias, + const operators::ActivationParam* act_param); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/gemm_s8u8_compute.cc b/lite/backends/loongarch/math/gemm_s8u8_compute.cc new file mode 100644 index 00000000000..4d70082fc79 --- /dev/null +++ b/lite/backends/loongarch/math/gemm_s8u8_compute.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2021 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/gemm_s8u8_compute.h" +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template <> +void generate_gemm_s8u8_loongarch_kern::repack_bias(bool is_trans, + int M, + int K, + const float *bias, + float *out, + float *Sa, + float Sb, + float Sc, + const int8_t *A) { + const int8_t *a_ptr = A; + for (int i = 0; i < M; i++) { + float bias_val = bias ? bias[i] : 0.f; + float sum = 0.f; + float scale = (Sa[i] * Sb) * TRANS_INT8_UINT8_OFFT; + a_ptr = A + i * K; + if (is_trans) { + for (int j = 0; j < K; j++) { + sum += A[i + j * M] * scale; + } + } else { + for (int j = 0; j < K; j++) { + sum += a_ptr[j] * scale; + } + } + out[i] = bias_val - sum; + out[i] = out[i] / Sc; + } +} + +template <> +void generate_gemm_s8u8_loongarch_kern::repack_bias(bool is_trans, + int M, + int K, + const float *bias, + float *out, + float *Sa, + float Sb, + float Sc, + const int8_t *A) { + const int8_t *a_ptr = A; + for (int i = 0; i < M; i++) { + float bias_val = bias ? bias[i] : 0.f; + float sum = 0.f; + float scale = (Sa[i] * Sb) * TRANS_INT8_UINT8_OFFT; + a_ptr = A + i * K; + if (is_trans) { + for (int j = 0; j < K; j++) { + sum += A[i + j * M] * scale; + } + } else { + for (int j = 0; j < K; j++) { + sum += a_ptr[j] * scale; + } + } + out[i] = bias_val - sum; + } +} + +template <> +void generate_gemm_s8u8_loongarch_kern::calc_scale( + int M, float *Sa, float Sb, float Sc, float *out) { + for (int i = 0; i < M; i++) { + out[i] = (Sa[i] * Sb) / Sc; + } +} + +template <> +void generate_gemm_s8u8_loongarch_kern::calc_scale( + int M, float *Sa, float Sb, float Sc, float *out) { + for (int i = 0; i < M; i++) { + out[i] = (Sa[i] * Sb); + } +} + +template <> +void generate_gemm_s8u8_loongarch_kern::calc_block( + int M, int N, int K, int *blk_m, int *blk_n) { + int block_size, scale_tmp; + int block_m, block_n; + + block_m = M; + block_n = 32 * _unroll_n; + // C(int8) + A(int8) + B(int8) + runtime packB(uint8) + block_size = block_m * block_n + _k_align4 * (block_m + 2 * block_n); + scale_tmp = static_cast(ceil(block_size * 1.f / _l2_size)); + scale_tmp = (scale_tmp + 1) / 2; + scale_tmp = scale_tmp * 2; + block_n = block_n / scale_tmp; + block_n = block_n / _unroll_n; + block_n = block_n * _unroll_n; + block_n = std::max(block_n, _unroll_n); + + *blk_m = block_m; + *blk_n = block_n; +} + +template <> +void generate_gemm_s8u8_loongarch_kern::calc_block( + int M, int N, int K, int *blk_m, int *blk_n) { + int block_size, scale_tmp; + int block_m, block_n; + + block_m = M; + block_n = 32 * _unroll_n; + // C(int8) + A(int8) + B(int8) + runtime packB(uint8) + block_size = + block_m * block_n * sizeof(float) + _k_align4 * (block_m + 2 * block_n); + scale_tmp = static_cast(ceil(block_size * 1.f / _l2_size)); + scale_tmp = (scale_tmp + 1) / 2; + scale_tmp = scale_tmp * 2; + block_n = block_n / scale_tmp; + block_n = block_n / _unroll_n; + block_n = block_n * _unroll_n; + block_n = std::max(block_n, _unroll_n); + + *blk_m = block_m; + *blk_n = block_n; +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff 
--git a/lite/backends/loongarch/math/gemm_s8u8_compute.h b/lite/backends/loongarch/math/gemm_s8u8_compute.h new file mode 100644 index 00000000000..2518ff3f5c9 --- /dev/null +++ b/lite/backends/loongarch/math/gemm_s8u8_compute.h @@ -0,0 +1,219 @@ +/* Copyright (c) 2021 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "lite/backends/loongarch/math/gemm_s8u8_kernel.h" +#include "lite/backends/loongarch/math/gemm_s8u8_pack.h" +#include "lite/core/memory.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +#define PARAM_INIT \ + _is_trans_A = is_trans_A; \ + _is_trans_B = is_trans_B; \ + _M = M; \ + _N = N; \ + _K = K; \ + _A = A; \ + _ldc = ldc; \ + _Sa = const_cast(Sa); \ + _Sb = Sb; \ + _Sc = Sc; \ + _relu_type = relu_type; \ + _relu_alpha = relu_alpha; + +template +class generate_gemm_s8u8_loongarch_kern { + public: + explicit generate_gemm_s8u8_loongarch_kern(bool is_trans_A, + bool is_trans_B, + int M, + int N, + int K, + const int8_t *A, + int ldc, + const float *Sa, + const float Sb, + const float Sc, + const float *bias, + int relu_type, + float relu_alpha) { + PARAM_INIT + gemm_int8_init(M, N, K, bias); + } + + ~generate_gemm_s8u8_loongarch_kern() { gemm_int8_deinit(); } + + void compute(const int8_t *A, const int8_t *B, TYPE_C *C) { + if (_relu_type < 0 || _relu_type > 3) { + LOG(FATAL) << "relu_type: 1 for relu, 2 for relu6, 3 for leakyrelu, but " + "receive is " + << _relu_type; + } + + _B = B; + _C = C; + int loop_m, loop_n; + int block_m, block_n; + int min_m, min_n; + int8_t *cur_a = _pack_A; + const int8_t *cur_b = _B; + TYPE_C *cur_c = _C; + calc_block(_M, _N, _K, &block_m, &block_n); + for (loop_n = 0; loop_n < _N; loop_n += block_n) { + min_n = ((_N - loop_n) >= block_n) ? block_n : (_N - loop_n); + cur_b = _is_trans_B ? (_B + loop_n * _K) : (_B + loop_n); + int step = _is_trans_B ? _K : _N; + packB_i82u8(min_n, _K, step, cur_b, _pack_B, _is_trans_B); + + for (loop_m = 0; loop_m < _M; loop_m += block_m) { + min_m = ((_M - loop_m) >= block_m) ? 
block_m : (_M - loop_m); + cur_a = _pack_A + loop_m * _k_align4; + cur_c = _C + loop_m * _ldc + loop_n; + + // kernel + gemm_kernel_loop_int8(min_m, + min_n, + _K, + cur_a, + _pack_B, + cur_c, + _ldc, + _scale + loop_m, + _re_bias + loop_m, + _relu_type, + _relu_alpha); + } + } + } + + private: + // inner param + int _k_align4; + int _relu_type; + int _M, _N, _K, _ldc; + float _Sb, _Sc; + float _relu_alpha; + bool _C_is_int8; + bool _is_trans_A; + bool _is_trans_B; + // divide block param + const int _unroll_n = 32; + const int _unroll_m = 2; + const int _l2_size = 262144; // 256K + // work buffer + TYPE_C *_C{nullptr}; + float *_Sa{nullptr}; + float *_scale{nullptr}; + float *_in_bias{nullptr}; + float *_re_bias{nullptr}; + int8_t *_pack_A{nullptr}; + uint8_t *_pack_B{nullptr}; + const int8_t *_A{nullptr}; + const int8_t *_B{nullptr}; + + // prepare input data + void repack_bias(bool is_trans, + int M, + int K, + const float *bias, + float *out, + float *Sa, + float Sb, + float Sc, + const int8_t *A); + + void calc_scale(int M, float *Sa, float Sb, float Sc, float *out); + + // pack A + void prepackA_i8( + int M, int K, const int8_t *A, int8_t *pack_A, bool is_trans) { + memset(pack_A, 0, _M * _k_align4); // important, can't delete + gemm_s8u8s8_prepackA(M, K, A, pack_A, is_trans); + } + + // pack B + void packB_i82u8(int N, + int K, + int stride, + const int8_t *B, + uint8_t *pack_B, + bool is_trans) { + gemm_s8u8s8_runpackB(N, K, stride, B, pack_B, is_trans); + } + + void gemm_int8_init(int M, int N, int K, const float *bias) { + int K_align4 = (K + 3) >> 2; + int block_n = 0; + int block_m = 0; + // calc block according to L2 size + K_align4 = K_align4 << 2; + _k_align4 = K_align4; + calc_block(M, N, K, &block_m, &block_n); + // malloc work_buf + _pack_A = reinterpret_cast( + TargetMalloc(TARGET(kLoongArch), block_m * K_align4)); + _pack_B = reinterpret_cast( + TargetMalloc(TARGET(kLoongArch), block_n * K_align4)); + _re_bias = reinterpret_cast( + TargetMalloc(TARGET(kLoongArch), M * sizeof(float))); + _scale = reinterpret_cast( + TargetMalloc(TARGET(kLoongArch), M * sizeof(float))); + // if no bias, malloc a buffer and set all zero. + if (bias == nullptr) { + _in_bias = reinterpret_cast( + TargetMalloc(TARGET(kLoongArch), M * sizeof(float))); + memset(_in_bias, 0, M * sizeof(float)); + repack_bias(_is_trans_A, M, K, _in_bias, _re_bias, _Sa, _Sb, _Sc, _A); + } else { + repack_bias(_is_trans_A, M, K, bias, _re_bias, _Sa, _Sb, _Sc, _A); + } + calc_scale(M, _Sa, _Sb, _Sc, _scale); + prepackA_i8(M, K, _A, _pack_A, _is_trans_A); + } + + void gemm_int8_deinit() { + if (_pack_A != nullptr) { + TargetFree(TARGET(kLoongArch), _pack_A); + } + if (_pack_B != nullptr) { + TargetFree(TARGET(kLoongArch), _pack_B); + } + if (_re_bias != nullptr) { + TargetFree(TARGET(kLoongArch), _re_bias); + } + if (_scale != nullptr) { + TargetFree(TARGET(kLoongArch), _scale); + } + if (_in_bias != nullptr) { + TargetFree(TARGET(kLoongArch), _in_bias); + } + } + + void calc_block(int M, int N, int K, int *blk_m, int *blk_n); +}; + +#undef PARAM_INIT + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/gemm_s8u8_kernel.cc b/lite/backends/loongarch/math/gemm_s8u8_kernel.cc new file mode 100644 index 00000000000..f289cfd9e88 --- /dev/null +++ b/lite/backends/loongarch/math/gemm_s8u8_kernel.cc @@ -0,0 +1,936 @@ +/* Copyright (c) 2021 paddlepaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef __loongarch_asx + +#include "lite/backends/loongarch/math/gemm_s8u8_kernel.h" +#include "lite/backends/loongarch/xxl.h" +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +//********************** activte and bias function ************************** +void gemm_fuse_relu_bias(__m256* vec_data, + __m256 vec_bias, + __m256 vec_alph, + __m256 vec_zero, + int act_mode) { + const int cmp_le_os = 2; + __m256 vec_lr, vec_mask; + *vec_data = lasx_add_f32(*vec_data, vec_bias); + switch (act_mode) { + case 1: + *vec_data = lasx_max_f32(*vec_data, vec_zero); // relu + break; + case 2: + *vec_data = + lasx_min_f32(lasx_max_f32(*vec_data, vec_zero), vec_alph); // relu6 + break; + case 3: + vec_lr = lasx_mul_f32(vec_alph, *vec_data); // lrelu + vec_mask = lasx_cmp_f32(*vec_data, vec_zero, cmp_le_os); + *vec_data = lasx_blendv_f32(*vec_data, vec_lr, vec_mask); + break; + default: + break; + } +} + +void gemm_fuse_relu_bias_128(__m128* vec_data, + __m128 vec_bias, + __m128 vec_alph, + __m128 vec_zero, + int act_mode) { + __m128 vec_lr_128, vec_mask_128; + *vec_data = lsx_add_f32(*vec_data, vec_bias); + switch (act_mode) { + case 1: + *vec_data = lsx_max_f32(*vec_data, vec_zero); + break; + case 2: + *vec_data = lsx_min_f32(lsx_max_f32(*vec_data, vec_zero), vec_alph); + break; + case 3: + vec_lr_128 = lsx_mul_f32(*vec_data, vec_alph); + vec_mask_128 = lsx_cmple_f32(*vec_data, vec_zero); + *vec_data = lsx_blendv_f32(*vec_data, vec_lr_128, vec_mask_128); + break; + default: + break; + } +} + +void gemm_fuse_relu_bias_f32(float* data, + float bias, + float alph, + int act_mode) { + *data += bias; + switch (act_mode) { + case 1: + *data = std::max(*data, 0.f); + break; + case 2: + *data = std::min(std::max(*data, 0.f), alph); + break; + case 3: + *data = *data > 0.f ? *data : alph * *data; + break; + default: + break; + } +} + +#define ACT_RELU_BIAS(data, bias, mode) \ + gemm_fuse_relu_bias(&data, bias, vec_alph, vec_zero, mode); + +#define ACT_RELU_BIAS_128(data, bias, mode) \ + gemm_fuse_relu_bias_128(&data, bias, vec_alph_128, vec_zero_128, mode); + +#define ACT_RELU_BIAS_FP32(data, bias, mode) \ + gemm_fuse_relu_bias_f32(&data, bias, relu_alpha, mode); + +//******************************** marco ************************************ +#define CLIP_BORDER_LEFT (-127) +#define CLIP_BORDER_RIGHT (127) + +#define CLIP_S8(a) \ + static_cast( \ + std::min(std::max(a, CLIP_BORDER_LEFT), CLIP_BORDER_RIGHT)) + +#define FLOAT2INT(a) \ + a > 0 ? 
static_cast(a + 0.5f) : static_cast(a - 0.5f) + +// extra 2 regs +#define LASX_DOT_U8S8(dst, src1, src2, vec_tmp_marco) \ + vec_tmp_marco = lasx_maddubs_i16(src1, src2); \ + vec_tmp_marco = lasx_madd_i16(vec_tmp_marco, vec_one_s16); \ + dst = lasx_add_i32(dst, vec_tmp_marco); + +#define LSX_DOT_U8S8(dst, src1, src2, vec_tmp_marco) \ + vec_tmp_marco = lsx_maddubs_i16(src1, src2); \ + vec_tmp_marco = lsx_madd_i16(vec_tmp_marco, vec_one_128); \ + dst = lsx_add_i32(dst, vec_tmp_marco); + +// 32 int to 32 int8 +#define INT32x32_2_INT8x32(out, in1, in2, in3, in4) \ + { \ + in1 = lasx_packs_i32(in1, in2); \ + in3 = lasx_packs_i32(in3, in4); \ + in4 = lasx_packs_i16(in1, in3); \ + __m128i hi_in = lasx_extractf128_m256i(in4, 1); \ + __m128i vec_i32_2_i8_tmp = \ + lsx_unpacklo_i32(lasx_castm256i_m128i(in4), hi_in); \ + hi_in = lsx_unpackhi_i32(lasx_castm256i_m128i(in4), hi_in); \ + out = lasx_inserti128_m256i(out, vec_i32_2_i8_tmp, 0); \ + out = lasx_inserti128_m256i(out, hi_in, 1); \ + out = lasx_max_i8(out, vec_mins_127); \ + } + +// BroadCast K4 8-bit data to 8 lanes +#define SET_A(i, offt) \ + vec_A##i = lasx_set1_i32(*reinterpret_cast(a_ptr + offt)); + +// BroadCast K4 8-bit data to 4 lanes +#define SET_A_128(i, offt) \ + vec_A##i##_128 = lsx_set1_i32(*reinterpret_cast(a_ptr + offt)); + +// Load K4xN8 8-bit data, total 256 bits +#define LOAD_B(i, offt) \ + vec_B##i = lasx_loadu_m256i(reinterpret_cast<__m256i const*>(b_ptr + offt)); + +// Load K4xN4 8-bit data, total 128 bits +#define LOAD_B_128(i, offt) \ + vec_B##i##_128 = \ + lsx_loadu_m128i(reinterpret_cast<__m128i const*>(b_ptr + offt)); + +#define SUDOT(c, b, a) LASX_DOT_U8S8(vec_C##c, vec_B##b, vec_A##a, vec_tmp) + +#define SUDOT_128(c, b, a) \ + LSX_DOT_U8S8(vec_C##c##_128, vec_B##b##_128, vec_A##a##_128, vec_tmp_128) + +#define INIT_C \ + vec_C0 = lasx_setzero_m256i(); \ + vec_C1 = lasx_setzero_m256i(); \ + vec_C2 = lasx_setzero_m256i(); \ + vec_C3 = lasx_setzero_m256i(); \ + vec_C4 = lasx_setzero_m256i(); \ + vec_C5 = lasx_setzero_m256i(); \ + vec_C6 = lasx_setzero_m256i(); \ + vec_C7 = lasx_setzero_m256i(); + +#define INIT_C_128 \ + vec_C0_128 = lsx_setzero_m128i(); \ + vec_C1_128 = lsx_setzero_m128i(); + +#define KERN_2x32 \ + SET_A(0, 0) \ + SET_A(1, 4) \ + LOAD_B(0, 0) \ + LOAD_B(1, 32) \ + LOAD_B(2, 64) \ + LOAD_B(3, 96) \ + SUDOT(0, 0, 0) \ + SUDOT(1, 1, 0) \ + SUDOT(2, 2, 0) \ + SUDOT(3, 3, 0) \ + SUDOT(4, 0, 1) SUDOT(5, 1, 1) SUDOT(6, 2, 1) SUDOT(7, 3, 1) a_ptr += 2 * 4; \ + b_ptr += 32 * 4; + +#define KERN_1x32 \ + SET_A(0, 0) \ + LOAD_B(0, 0) \ + LOAD_B(1, 32) \ + LOAD_B(2, 64) \ + LOAD_B(3, 96) \ + SUDOT(0, 0, 0) SUDOT(1, 1, 0) SUDOT(2, 2, 0) SUDOT(3, 3, 0) a_ptr += 4; \ + b_ptr += 32 * 4; + +#define KERN_2x24 \ + SET_A(0, 0) \ + SET_A(1, 4) \ + LOAD_B(0, 0) \ + LOAD_B(1, 32) \ + LOAD_B(2, 64) \ + SUDOT(0, 0, 0) \ + SUDOT(1, 1, 0) \ + SUDOT(2, 2, 0) SUDOT(4, 0, 1) SUDOT(5, 1, 1) SUDOT(6, 2, 1) a_ptr += 2 * 4; \ + b_ptr += 24 * 4; + +#define KERN_1x24 \ + SET_A(0, 0) \ + LOAD_B(0, 0) \ + LOAD_B(1, 32) \ + LOAD_B(2, 64) SUDOT(0, 0, 0) SUDOT(1, 1, 0) SUDOT(2, 2, 0) a_ptr += 4; \ + b_ptr += 24 * 4; + +#define KERN_2x16 \ + SET_A(0, 0) \ + SET_A(1, 4) \ + LOAD_B(0, 0) \ + LOAD_B(1, 32) \ + SUDOT(0, 0, 0) SUDOT(1, 1, 0) SUDOT(4, 0, 1) SUDOT(5, 1, 1) a_ptr += 2 * 4; \ + b_ptr += 16 * 4; + +#define KERN_1x16 \ + SET_A(0, 0) \ + LOAD_B(0, 0) LOAD_B(1, 32) SUDOT(0, 0, 0) SUDOT(1, 1, 0) a_ptr += 4; \ + b_ptr += 16 * 4; + +#define KERN_2x8 \ + SET_A(0, 0) \ + SET_A(1, 4) LOAD_B(0, 0) SUDOT(0, 0, 0) SUDOT(4, 0, 1) a_ptr += 2 * 4; \ + 
b_ptr += 8 * 4; + +#define KERN_1x8 \ + SET_A(0, 0) \ + LOAD_B(0, 0) \ + SUDOT(0, 0, 0) \ + a_ptr += 4; \ + b_ptr += 8 * 4; + +#define KERN_2x4 \ + SET_A_128(0, 0) \ + SET_A_128(1, 4) \ + LOAD_B_128(0, 0) SUDOT_128(0, 0, 0) SUDOT_128(1, 0, 1) a_ptr += 2 * 4; \ + b_ptr += 4 * 4; + +#define KERN_1x4 \ + SET_A_128(0, 0) \ + LOAD_B_128(0, 0) \ + SUDOT_128(0, 0, 0) \ + a_ptr += 4; \ + b_ptr += 4 * 4; + +#define KERN_2x2 \ + SET_A_128(0, 0) \ + SET_A_128(1, 4) \ + vec_B0_128 = lsx_loadl_i64(reinterpret_cast<__m128i const*>(b_ptr)); \ + SUDOT_128(0, 0, 0) SUDOT_128(1, 0, 1) a_ptr += 2 * 4; \ + b_ptr += 2 * 4; + +#define KERN_1x2 \ + SET_A_128(0, 0) \ + vec_B0_128 = lsx_loadl_i64(reinterpret_cast<__m128i const*>(b_ptr)); \ + SUDOT_128(0, 0, 0) \ + a_ptr += 4; \ + b_ptr += 2 * 4; + +#define STORE_32(in0, in1, in2, in3, i) \ + dst_vec_ps0 = lasx_mul_f32(lasx_cvti32_f32(in0), vec_scale[i]); \ + dst_vec_ps1 = lasx_mul_f32(lasx_cvti32_f32(in1), vec_scale[i]); \ + dst_vec_ps2 = lasx_mul_f32(lasx_cvti32_f32(in2), vec_scale[i]); \ + dst_vec_ps3 = lasx_mul_f32(lasx_cvti32_f32(in3), vec_scale[i]); \ + ACT_RELU_BIAS(dst_vec_ps0, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps1, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps2, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps3, vec_bias[i], relu_type) \ + in0 = lasx_cvtf32_i32(dst_vec_ps0); \ + in1 = lasx_cvtf32_i32(dst_vec_ps1); \ + in2 = lasx_cvtf32_i32(dst_vec_ps2); \ + in3 = lasx_cvtf32_i32(dst_vec_ps3); \ + INT32x32_2_INT8x32(dst_vec, in0, in1, in2, in3) lasx_storeu_m256i( \ + reinterpret_cast<__m256i*>(c_ptr + i * ldc), dst_vec); + +#define STORE_24(in0, in1, in2, in3, i) \ + dst_vec_ps0 = lasx_mul_f32(lasx_cvti32_f32(in0), vec_scale[i]); \ + dst_vec_ps1 = lasx_mul_f32(lasx_cvti32_f32(in1), vec_scale[i]); \ + dst_vec_ps2 = lasx_mul_f32(lasx_cvti32_f32(in2), vec_scale[i]); \ + ACT_RELU_BIAS(dst_vec_ps0, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps1, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps2, vec_bias[i], relu_type) \ + in0 = lasx_cvtf32_i32(dst_vec_ps0); \ + in1 = lasx_cvtf32_i32(dst_vec_ps1); \ + in2 = lasx_cvtf32_i32(dst_vec_ps2); \ + INT32x32_2_INT8x32(dst_vec, in0, in1, in2, in3) lasx_maskstore_i32( \ + reinterpret_cast(c_ptr + i * ldc), vec_mask, dst_vec); + +#define STORE_16(in0, in1, in2, in3, i) \ + dst_vec_ps0 = lasx_mul_f32(lasx_cvti32_f32(in0), vec_scale[i]); \ + dst_vec_ps1 = lasx_mul_f32(lasx_cvti32_f32(in1), vec_scale[i]); \ + ACT_RELU_BIAS(dst_vec_ps0, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps1, vec_bias[i], relu_type) \ + in0 = lasx_cvtf32_i32(dst_vec_ps0); \ + in1 = lasx_cvtf32_i32(dst_vec_ps1); \ + INT32x32_2_INT8x32(dst_vec, in0, in1, in2, in3) dst_vec_128 = \ + lasx_castm256i_m128i(dst_vec); \ + lsx_storeu_m128i(reinterpret_cast<__m128i*>(c_ptr + i * ldc), dst_vec_128); + +#define STORE_8(in0, in1, in2, in3, i) \ + dst_vec_ps0 = lasx_mul_f32(lasx_cvti32_f32(in0), vec_scale[i]); \ + ACT_RELU_BIAS(dst_vec_ps0, vec_bias[i], relu_type) \ + in0 = lasx_cvtf32_i32(dst_vec_ps0); \ + INT32x32_2_INT8x32(dst_vec, in0, in1, in2, in3) dst_vec_128 = \ + lasx_castm256i_m128i(dst_vec); \ + lsx_storel_pi(reinterpret_cast<__m64*>(c_ptr + i * ldc), \ + lsx_castm128i_f32(dst_vec_128)); + +// __m128 +#define STORE_4(in0, i) \ + { \ + dst_vec_ps0_128 = lsx_mul_f32(lsx_cvti32_f32(in0), vec_scale_128[i]); \ + ACT_RELU_BIAS_128(dst_vec_ps0_128, vec_bias_128[i], relu_type) \ + in0 = lsx_cvtf32_i32(dst_vec_ps0_128); \ + in0 = lsx_min_i32(lsx_max_i32(in0, vec_left), vec_right); \ + int* ptr = reinterpret_cast(&in0); \ 
+ *(c_ptr + i * ldc) = static_cast(ptr[0]); \ + *(c_ptr + i * ldc + 1) = static_cast(ptr[1]); \ + *(c_ptr + i * ldc + 2) = static_cast(ptr[2]); \ + *(c_ptr + i * ldc + 3) = static_cast(ptr[3]); \ + } + +#define STORE_2(in0, i) \ + { \ + int* in0_ptr = reinterpret_cast(&in0); \ + float bias_data = (*(bias_ptr + idx_m + i)); \ + float in0_f32 = in0_ptr[0] * (*(scale_ptr + idx_m + i)); \ + ACT_RELU_BIAS_FP32(in0_f32, bias_data, relu_type) \ + int in0_int = FLOAT2INT(in0_f32); \ + *(c_ptr + i * ldc) = CLIP_S8(in0_int); \ + in0_f32 = in0_ptr[1] * (*(scale_ptr + idx_m + i)); \ + ACT_RELU_BIAS_FP32(in0_f32, bias_data, relu_type) \ + in0_int = FLOAT2INT(in0_f32); \ + *(c_ptr + i * ldc + 1) = CLIP_S8(in0_int); \ + } + +void gemm_kernel_loop_int8(int M, + int N, + int K, + int8_t* A, + uint8_t* B, + int8_t* C, + int ldc, + const float* scale, + const float* bias, + int relu_type, + float relu_alpha) { + int8_t* a_ptr = A; + int8_t* c_ptr = C; + uint8_t* b_ptr = B; + const float* scale_ptr = scale; + const float* bias_ptr = bias; + int k_loop = (K + 3) >> 2; + int pack_k = k_loop << 2; + int idx_n = 0, idx_m = 0, idx_k = 0; + + // total 16 regs + __m256i vec_C0, vec_C1, vec_C2, vec_C3; + __m256i vec_C4, vec_C5, vec_C6, vec_C7; + __m256i vec_B0, vec_B1, vec_B2, vec_B3; + __m256i vec_A0, vec_A1, vec_tmp; + __m256i vec_one_s16 = lasx_set1_i16(static_cast(1)); + // save result + __m256i dst_vec; + __m256 vec_bias[2]; + __m256 vec_scale[2]; + __m256 dst_vec_ps0, dst_vec_ps1, dst_vec_ps2, dst_vec_ps3; + // bias and relu + __m256 vec_alph = lasx_set1_f32(relu_alpha); + __m256 vec_zero = lasx_set1_f32(0.f); + // val is in -127, 127, the other side using packs to guarantee + __m256i vec_mins_127 = lasx_set1_i8(static_cast(CLIP_BORDER_LEFT)); + + __m128i vec_C0_128, vec_C1_128; + __m128i vec_B0_128; + __m128i vec_A0_128, vec_A1_128, vec_tmp_128; + __m128i vec_one_128 = lsx_set1_i16(static_cast(1)); + // save result + __m128i dst_vec_128; + __m128 vec_bias_128[2]; + __m128 vec_scale_128[2]; + __m128 dst_vec_ps0_128; + // bias and relu + __m128 vec_alph_128 = lsx_set1_f32(relu_alpha); + __m128 vec_zero_128 = lsx_set1_f32(0.f); + // clip + __m128i vec_left = lsx_set1_i32(static_cast(CLIP_BORDER_LEFT)); + __m128i vec_right = lsx_set1_i32(static_cast(CLIP_BORDER_RIGHT)); + + // mask load, store + int mask0[8] = {-1, -1, -1, -1, -1, -1, 0, 0}; // load or save 24 int8-data + __m256i vec_mask = + lasx_loadu_m256i(reinterpret_cast<__m256i const*>(mask0)); + + // block A + for (idx_m = 0; idx_m + 1 < M; idx_m += 2) { + c_ptr = C; + b_ptr = B; + a_ptr = A; + C += 2 * ldc; + + // bias and scale + vec_bias[0] = lasx_set1_f32(*(bias_ptr + idx_m)); + vec_bias[1] = lasx_set1_f32(*(bias_ptr + idx_m + 1)); + vec_scale[0] = lasx_set1_f32(*(scale_ptr + idx_m)); + vec_scale[1] = lasx_set1_f32(*(scale_ptr + idx_m + 1)); + vec_bias_128[0] = lsx_set1_f32(*(bias_ptr + idx_m)); + vec_bias_128[1] = lsx_set1_f32(*(bias_ptr + idx_m + 1)); + vec_scale_128[0] = lsx_set1_f32(*(scale_ptr + idx_m)); + vec_scale_128[1] = lsx_set1_f32(*(scale_ptr + idx_m + 1)); + + // block B + for (idx_n = 0; idx_n + 31 < N; idx_n += 32) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x32 + } + STORE_32(vec_C0, vec_C1, vec_C2, vec_C3, 0) + STORE_32(vec_C4, vec_C5, vec_C6, vec_C7, 1) + c_ptr += 32; + } + for (; idx_n + 23 < N; idx_n += 24) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x24 + } + STORE_24(vec_C0, vec_C1, vec_C2, vec_C3, 0) + STORE_24(vec_C4, vec_C5, vec_C6, vec_C7, 1) + c_ptr += 24; + } + 
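+    // Leftover columns fall through narrower tiles (16/8/4/2) and finally a
+    // scalar column loop below.  Every tile accumulates K in groups of 4 via
+    // the maddubs(u8, s8) + madd(i16, 1) pair, which is why B is repacked to
+    // uint8 (B + 128) and the 128 * sum(A_row) correction is pre-folded into
+    // the repacked bias.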
for (; idx_n + 15 < N; idx_n += 16) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x16 + } + STORE_16(vec_C0, vec_C1, vec_C2, vec_C3, 0) + STORE_16(vec_C4, vec_C5, vec_C6, vec_C7, 1) + c_ptr += 16; + } + for (; idx_n + 7 < N; idx_n += 8) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x8 + } + STORE_8(vec_C0, vec_C1, vec_C2, vec_C3, 0) + STORE_8(vec_C4, vec_C5, vec_C6, vec_C7, 1) + c_ptr += 8; + } + for (; idx_n + 3 < N; idx_n += 4) { + a_ptr = A; + INIT_C_128 + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x4 + } + STORE_4(vec_C0_128, 0) + STORE_4(vec_C1_128, 1) + c_ptr += 4; + } + for (; idx_n + 1 < N; idx_n += 2) { + a_ptr = A; + INIT_C_128 + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x2 + } + STORE_2(vec_C0_128, 0) + STORE_2(vec_C1_128, 1) + c_ptr += 2; + } + for (; idx_n < N; idx_n++) { + a_ptr = A; + float acc0 = 0; + float acc1 = 0; + float bias0 = (*(bias_ptr + idx_m)); + float bias1 = (*(bias_ptr + idx_m + 1)); + float scale0 = (*(scale_ptr + idx_m)); + float scale1 = (*(scale_ptr + idx_m + 1)); + for (idx_k = 0; idx_k < k_loop; idx_k++) { + for (int k = 0; k < 4; k++) { + acc0 += + static_cast(a_ptr[k]) * static_cast(b_ptr[k]) * scale0; + acc1 += static_cast(a_ptr[k + 4]) * static_cast(b_ptr[k]) * + scale1; + } + a_ptr += 2 * 4; + b_ptr += 4; + } + ACT_RELU_BIAS_FP32(acc0, bias0, relu_type) + ACT_RELU_BIAS_FP32(acc1, bias1, relu_type) + int iacc0 = FLOAT2INT(acc0); + int iacc1 = FLOAT2INT(acc1); + int8_t acc0_s8 = CLIP_S8(iacc0); + int8_t acc1_s8 = CLIP_S8(iacc1); + c_ptr[0] = acc0_s8; + c_ptr[ldc] = acc1_s8; + c_ptr++; + } + A += 2 * pack_k; + } + for (; idx_m < M; idx_m += 1) { + c_ptr = C; + b_ptr = B; + a_ptr = A; + C += ldc; + + // bias and scale + vec_bias[0] = lasx_set1_f32(*(bias_ptr + idx_m)); + vec_scale[0] = lasx_set1_f32(*(scale_ptr + idx_m)); + vec_bias_128[0] = lsx_set1_f32(*(bias_ptr + idx_m)); + vec_scale_128[0] = lsx_set1_f32(*(scale_ptr + idx_m)); + + // block B + for (idx_n = 0; idx_n + 31 < N; idx_n += 32) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x32 + } + STORE_32(vec_C0, vec_C1, vec_C2, vec_C3, 0) + c_ptr += 32; + } + for (; idx_n + 23 < N; idx_n += 24) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x24 + } + STORE_24(vec_C0, vec_C1, vec_C2, vec_C3, 0) + c_ptr += 24; + } + for (; idx_n + 15 < N; idx_n += 16) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x16 + } + STORE_16(vec_C0, vec_C1, vec_C2, vec_C3, 0) + c_ptr += 16; + } + for (; idx_n + 7 < N; idx_n += 8) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x8 + } + STORE_8(vec_C0, vec_C1, vec_C2, vec_C3, 0) + c_ptr += 8; + } + for (; idx_n + 3 < N; idx_n += 4) { + a_ptr = A; + INIT_C_128 + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x4 + } + STORE_4(vec_C0_128, 0) + c_ptr += 4; + } + for (; idx_n + 1 < N; idx_n += 2) { + a_ptr = A; + INIT_C_128 + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x2 + } + STORE_2(vec_C0_128, 0) + c_ptr += 2; + } + for (; idx_n < N; idx_n++) { + a_ptr = A; + float acc0 = 0; + float bias0 = (*(bias_ptr + idx_m)); + float scale0 = (*(scale_ptr + idx_m)); + for (idx_k = 0; idx_k < k_loop; idx_k++) { + for (int k = 0; k < 4; k++) { + acc0 += + static_cast(a_ptr[k]) * static_cast(b_ptr[k]) * scale0; + } + a_ptr += 4; + b_ptr += 4; + } + ACT_RELU_BIAS_FP32(acc0, bias0, relu_type) + int iacc0 = FLOAT2INT(acc0); + int8_t acc0_s8 = CLIP_S8(iacc0); + c_ptr[0] = acc0_s8; + c_ptr++; + 
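+      // FLOAT2INT rounds half away from zero and CLIP_S8 saturates to
+      // [-127, 127], matching the packs/max(-127) clipping used by the
+      // vector tiles above.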
} + A += pack_k; + } +} + +#define STORE_32_float(in0, in1, in2, in3, i) \ + dst_vec_ps0 = lasx_mul_f32(lasx_cvti32_f32(in0), vec_scale[i]); \ + dst_vec_ps1 = lasx_mul_f32(lasx_cvti32_f32(in1), vec_scale[i]); \ + dst_vec_ps2 = lasx_mul_f32(lasx_cvti32_f32(in2), vec_scale[i]); \ + dst_vec_ps3 = lasx_mul_f32(lasx_cvti32_f32(in3), vec_scale[i]); \ + ACT_RELU_BIAS(dst_vec_ps0, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps1, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps2, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps3, vec_bias[i], relu_type) \ + lasx_storeu_f32(c_ptr + i * ldc, dst_vec_ps0); \ + lasx_storeu_f32(c_ptr + i * ldc + 8, dst_vec_ps1); \ + lasx_storeu_f32(c_ptr + i * ldc + 16, dst_vec_ps2); \ + lasx_storeu_f32(c_ptr + i * ldc + 24, dst_vec_ps3); + +#define STORE_24_float(in0, in1, in2, in3, i) \ + dst_vec_ps0 = lasx_mul_f32(lasx_cvti32_f32(in0), vec_scale[i]); \ + dst_vec_ps1 = lasx_mul_f32(lasx_cvti32_f32(in1), vec_scale[i]); \ + dst_vec_ps2 = lasx_mul_f32(lasx_cvti32_f32(in2), vec_scale[i]); \ + ACT_RELU_BIAS(dst_vec_ps0, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps1, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps2, vec_bias[i], relu_type) \ + lasx_storeu_f32(c_ptr + i * ldc, dst_vec_ps0); \ + lasx_storeu_f32(c_ptr + i * ldc + 8, dst_vec_ps1); \ + lasx_storeu_f32(c_ptr + i * ldc + 16, dst_vec_ps2); + +#define STORE_16_float(in0, in1, in2, in3, i) \ + dst_vec_ps0 = lasx_mul_f32(lasx_cvti32_f32(in0), vec_scale[i]); \ + dst_vec_ps1 = lasx_mul_f32(lasx_cvti32_f32(in1), vec_scale[i]); \ + ACT_RELU_BIAS(dst_vec_ps0, vec_bias[i], relu_type) \ + ACT_RELU_BIAS(dst_vec_ps1, vec_bias[i], relu_type) \ + lasx_storeu_f32(c_ptr + i * ldc, dst_vec_ps0); \ + lasx_storeu_f32(c_ptr + i * ldc + 8, dst_vec_ps1); + +#define STORE_8_float(in0, in1, in2, in3, i) \ + dst_vec_ps0 = lasx_mul_f32(lasx_cvti32_f32(in0), vec_scale[i]); \ + ACT_RELU_BIAS(dst_vec_ps0, vec_bias[i], relu_type) \ + lasx_storeu_f32(c_ptr + i * ldc, dst_vec_ps0); + +// __m128 +#define STORE_4_float(in0, i) \ + { \ + dst_vec_ps0_128 = lsx_mul_f32(lsx_cvti32_f32(in0), vec_scale_128[i]); \ + ACT_RELU_BIAS_128(dst_vec_ps0_128, vec_bias_128[i], relu_type) \ + lsx_storeu_f32(c_ptr + i * ldc, dst_vec_ps0_128); \ + } + +#define STORE_2_float(in0, i) \ + { \ + int* in0_ptr = reinterpret_cast(&in0); \ + float bias_data = (*(bias_ptr + idx_m + i)); \ + float in0_f32 = in0_ptr[0] * (*(scale_ptr + idx_m + i)); \ + ACT_RELU_BIAS_FP32(in0_f32, bias_data, relu_type) \ + *(c_ptr + i * ldc) = in0_f32; \ + in0_f32 = in0_ptr[1] * (*(scale_ptr + idx_m + i)); \ + ACT_RELU_BIAS_FP32(in0_f32, bias_data, relu_type) \ + *(c_ptr + i * ldc + 1) = in0_f32; \ + } + +void gemm_kernel_loop_int8(int M, + int N, + int K, + int8_t* A, + uint8_t* B, + float* C, + int ldc, + const float* scale, + const float* bias, + int relu_type, + float relu_alpha) { + int8_t* a_ptr = A; + float* c_ptr = C; + uint8_t* b_ptr = B; + const float* scale_ptr = scale; + const float* bias_ptr = bias; + int k_loop = (K + 3) >> 2; + int pack_k = k_loop << 2; + int idx_n = 0, idx_m = 0, idx_k = 0; + + // total 16 regs + __m256i vec_C0, vec_C1, vec_C2, vec_C3; + __m256i vec_C4, vec_C5, vec_C6, vec_C7; + __m256i vec_B0, vec_B1, vec_B2, vec_B3; + __m256i vec_A0, vec_A1, vec_tmp; + __m256i vec_one_s16 = lasx_set1_i16(static_cast(1)); + // save result + __m256 vec_bias[2]; + __m256 vec_scale[2]; + __m256 dst_vec_ps0, dst_vec_ps1, dst_vec_ps2, dst_vec_ps3; + // bias and relu + __m256 vec_alph = lasx_set1_f32(relu_alpha); + __m256 vec_zero = lasx_set1_f32(0.f); + + 
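+  // Float-output variant: the u8*s8 accumulation below is identical to the
+  // int8-output kernel above; only the stores differ, writing the scaled,
+  // bias-added and activated float values directly, so no rounding or
+  // saturation to [-127, 127] (and no vec_mins_127) is needed here.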
__m128i vec_C0_128, vec_C1_128; + __m128i vec_B0_128; + __m128i vec_A0_128, vec_A1_128, vec_tmp_128; + __m128i vec_one_128 = lsx_set1_i16(static_cast(1)); + // save result + __m128 vec_bias_128[2]; + __m128 vec_scale_128[2]; + __m128 dst_vec_ps0_128; + // bias and relu + __m128 vec_alph_128 = lsx_set1_f32(relu_alpha); + __m128 vec_zero_128 = lsx_set1_f32(0.f); + + // block A + for (idx_m = 0; idx_m + 1 < M; idx_m += 2) { + c_ptr = C; + b_ptr = B; + a_ptr = A; + C += 2 * ldc; + + // bias and scale + vec_bias[0] = lasx_set1_f32(*(bias_ptr + idx_m)); + vec_bias[1] = lasx_set1_f32(*(bias_ptr + idx_m + 1)); + vec_scale[0] = lasx_set1_f32(*(scale_ptr + idx_m)); + vec_scale[1] = lasx_set1_f32(*(scale_ptr + idx_m + 1)); + vec_bias_128[0] = lsx_set1_f32(*(bias_ptr + idx_m)); + vec_bias_128[1] = lsx_set1_f32(*(bias_ptr + idx_m + 1)); + vec_scale_128[0] = lsx_set1_f32(*(scale_ptr + idx_m)); + vec_scale_128[1] = lsx_set1_f32(*(scale_ptr + idx_m + 1)); + + // block B + for (idx_n = 0; idx_n + 31 < N; idx_n += 32) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x32 + } + STORE_32_float(vec_C0, vec_C1, vec_C2, vec_C3, 0) + STORE_32_float(vec_C4, vec_C5, vec_C6, vec_C7, 1) c_ptr += 32; + } + for (; idx_n + 23 < N; idx_n += 24) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x24 + } + STORE_24_float(vec_C0, vec_C1, vec_C2, vec_C3, 0) + STORE_24_float(vec_C4, vec_C5, vec_C6, vec_C7, 1) c_ptr += 24; + } + for (; idx_n + 15 < N; idx_n += 16) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x16 + } + STORE_16_float(vec_C0, vec_C1, vec_C2, vec_C3, 0) + STORE_16_float(vec_C4, vec_C5, vec_C6, vec_C7, 1) c_ptr += 16; + } + for (; idx_n + 7 < N; idx_n += 8) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x8 + } + STORE_8_float(vec_C0, vec_C1, vec_C2, vec_C3, 0) + STORE_8_float(vec_C4, vec_C5, vec_C6, vec_C7, 1) c_ptr += 8; + } + for (; idx_n + 3 < N; idx_n += 4) { + a_ptr = A; + INIT_C_128 + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x4 + } + STORE_4_float(vec_C0_128, 0) STORE_4_float(vec_C1_128, 1) c_ptr += 4; + } + for (; idx_n + 1 < N; idx_n += 2) { + a_ptr = A; + INIT_C_128 + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_2x2 + } + STORE_2_float(vec_C0_128, 0) STORE_2_float(vec_C1_128, 1) c_ptr += 2; + } + for (; idx_n < N; idx_n++) { + a_ptr = A; + float acc0 = 0; + float acc1 = 0; + float bias0 = (*(bias_ptr + idx_m)); + float bias1 = (*(bias_ptr + idx_m + 1)); + float scale0 = (*(scale_ptr + idx_m)); + float scale1 = (*(scale_ptr + idx_m + 1)); + for (idx_k = 0; idx_k < k_loop; idx_k++) { + for (int k = 0; k < 4; k++) { + acc0 += + static_cast(a_ptr[k]) * static_cast(b_ptr[k]) * scale0; + acc1 += static_cast(a_ptr[k + 4]) * static_cast(b_ptr[k]) * + scale1; + } + a_ptr += 2 * 4; + b_ptr += 4; + } + ACT_RELU_BIAS_FP32(acc0, bias0, relu_type) + ACT_RELU_BIAS_FP32(acc1, bias1, relu_type) + c_ptr[0] = acc0; + c_ptr[ldc] = acc1; + c_ptr++; + } + A += 2 * pack_k; + } + for (; idx_m < M; idx_m += 1) { + c_ptr = C; + b_ptr = B; + a_ptr = A; + C += ldc; + + // bias and scale + vec_bias[0] = lasx_set1_f32(*(bias_ptr + idx_m)); + vec_scale[0] = lasx_set1_f32(*(scale_ptr + idx_m)); + vec_bias_128[0] = lsx_set1_f32(*(bias_ptr + idx_m)); + vec_scale_128[0] = lsx_set1_f32(*(scale_ptr + idx_m)); + + // block B + for (idx_n = 0; idx_n + 31 < N; idx_n += 32) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x32 + } + STORE_32_float(vec_C0, vec_C1, vec_C2, vec_C3, 0) c_ptr += 
32; + } + for (; idx_n + 23 < N; idx_n += 24) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x24 + } + STORE_24_float(vec_C0, vec_C1, vec_C2, vec_C3, 0) c_ptr += 24; + } + for (; idx_n + 15 < N; idx_n += 16) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x16 + } + STORE_16_float(vec_C0, vec_C1, vec_C2, vec_C3, 0) c_ptr += 16; + } + for (; idx_n + 7 < N; idx_n += 8) { + a_ptr = A; + INIT_C + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x8 + } + STORE_8_float(vec_C0, vec_C1, vec_C2, vec_C3, 0) c_ptr += 8; + } + for (; idx_n + 3 < N; idx_n += 4) { + a_ptr = A; + INIT_C_128 + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x4 + } + STORE_4_float(vec_C0_128, 0) c_ptr += 4; + } + for (; idx_n + 1 < N; idx_n += 2) { + a_ptr = A; + INIT_C_128 + for (idx_k = 0; idx_k < k_loop; idx_k++) { + KERN_1x2 + } + STORE_2_float(vec_C0_128, 0) c_ptr += 2; + } + for (; idx_n < N; idx_n++) { + a_ptr = A; + float acc0 = 0; + float bias0 = (*(bias_ptr + idx_m)); + float scale0 = (*(scale_ptr + idx_m)); + for (idx_k = 0; idx_k < k_loop; idx_k++) { + for (int k = 0; k < 4; k++) { + acc0 += + static_cast(a_ptr[k]) * static_cast(b_ptr[k]) * scale0; + } + a_ptr += 4; + b_ptr += 4; + } + ACT_RELU_BIAS_FP32(acc0, bias0, relu_type) + c_ptr[0] = acc0; + c_ptr++; + } + A += pack_k; + } +} + +#undef ACT_RELU_BIAS +#undef ACT_RELU_BIAS_128 +#undef ACT_RELU_BIAS_FP32 +#undef CLIP_BORDER_LEFT +#undef CLIP_BORDER_RIGHT +#undef CLIP_S8 +#undef FLOAT2INT +#undef LASX_DOT_U8S8 +#undef LSX_DOT_U8S8 +#undef INT32x32_2_INT8x32 +#undef SET_A +#undef SET_A_128 +#undef LOAD_B +#undef LOAD_B_128 +#undef SUDOT +#undef SUDOT_128 +#undef INIT_C +#undef INIT_C_128 +#undef KERN_2x32 +#undef KERN_1x32 +#undef KERN_2x24 +#undef KERN_1x24 +#undef KERN_2x16 +#undef KERN_1x16 +#undef KERN_2x8 +#undef KERN_1x8 +#undef KERN_2x4 +#undef KERN_1x4 +#undef KERN_2x2 +#undef KERN_1x2 +#undef STORE_32 +#undef STORE_24 +#undef STORE_16 +#undef STORE_8 +#undef STORE_4 +#undef STORE_2 +#undef STORE_32_float +#undef STORE_24_float +#undef STORE_16_float +#undef STORE_8_float +#undef STORE_4_float +#undef STORE_2_float + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle + +#endif // __loongarch_asx diff --git a/lite/backends/loongarch/math/gemm_s8u8_kernel.h b/lite/backends/loongarch/math/gemm_s8u8_kernel.h new file mode 100644 index 00000000000..b08ce50324d --- /dev/null +++ b/lite/backends/loongarch/math/gemm_s8u8_kernel.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void gemm_kernel_loop_int8(int M, + int N, + int K, + int8_t* A, + uint8_t* B, + int8_t* C, + int ldc, + const float* scale, + const float* bias, + int relu_type, + float relu_alpha); + +void gemm_kernel_loop_int8(int M, + int N, + int K, + int8_t* A, + uint8_t* B, + float* C, + int ldc, + const float* scale, + const float* bias, + int relu_type, + float relu_alpha); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/gemm_s8u8_pack.cc b/lite/backends/loongarch/math/gemm_s8u8_pack.cc new file mode 100644 index 00000000000..7394936b21d --- /dev/null +++ b/lite/backends/loongarch/math/gemm_s8u8_pack.cc @@ -0,0 +1,1191 @@ +/* Copyright (c) 2021 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/gemm_s8u8_pack.h" +#include "lite/backends/loongarch/xxl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + + +#ifndef _MSC_VER +typedef long long int __int64; // NOLINT +#endif + +// PrePack A +#define TRANSPOSEA_4x16 \ + vec_12 = lsx_unpacklo_i8(vec_line[0], vec_line[1]); \ + vec_23 = lsx_unpacklo_i8(vec_line[2], vec_line[3]); \ + vec_out = lsx_unpacklo_i16(vec_12, vec_23); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr), \ + lsx_castm128i_f32(vec_out)); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 2), \ + lsx_castm128i_f32(lsx_srli_m128i(vec_out, 8))); \ + vec_out = lsx_unpackhi_i16(vec_12, vec_23); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 4), \ + lsx_castm128i_f32(vec_out)); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 6), \ + lsx_castm128i_f32(lsx_srli_m128i(vec_out, 8))); \ + vec_12 = lsx_unpackhi_i8(vec_line[0], vec_line[1]); \ + vec_23 = lsx_unpackhi_i8(vec_line[2], vec_line[3]); \ + vec_out = lsx_unpacklo_i16(vec_12, vec_23); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 8), \ + lsx_castm128i_f32(vec_out)); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 10), \ + lsx_castm128i_f32(lsx_srli_m128i(vec_out, 8))); \ + vec_out = lsx_unpackhi_i16(vec_12, vec_23); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 12), \ + lsx_castm128i_f32(vec_out)); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 14), \ + lsx_castm128i_f32(lsx_srli_m128i(vec_out, 8))); + +#define TRANSPOSEA_4x8 \ + vec_12 = lsx_unpacklo_i8(vec_line[0], vec_line[1]); \ + vec_23 = lsx_unpacklo_i8(vec_line[2], vec_line[3]); \ + vec_out = lsx_unpacklo_i16(vec_12, vec_23); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr), \ + lsx_castm128i_f32(vec_out)); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 2), \ + lsx_castm128i_f32(lsx_srli_m128i(vec_out, 8))); \ + vec_out = lsx_unpackhi_i16(vec_12, vec_23); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 4), \ + lsx_castm128i_f32(vec_out)); 
\ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 6), \ + lsx_castm128i_f32(lsx_srli_m128i(vec_out, 8))); + +#define TRANSPOSEA_4x4 \ + vec_12 = lsx_unpacklo_i8(vec_line[0], vec_line[1]); \ + vec_23 = lsx_unpacklo_i8(vec_line[2], vec_line[3]); \ + vec_out = lsx_unpacklo_i16(vec_12, vec_23); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr), \ + lsx_castm128i_f32(vec_out)); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr + K_align * 2), \ + lsx_castm128i_f32(lsx_srli_m128i(vec_out, 8))); + +#define TRANSPOSEA_4x2 \ + vec_12 = lsx_unpacklo_i8(vec_line[0], vec_line[1]); \ + vec_23 = lsx_unpacklo_i8(vec_line[2], vec_line[3]); \ + vec_out = lsx_unpacklo_i16(vec_12, vec_23); \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr), lsx_castm128i_f32(vec_out)); + +#ifdef __loongarch_asx + +// runtime Pack B +/* +Attention: +1. B need to add 128 during packing, transfering from int8 to uint8. +2. B has transpose mode as well. +3. K is 4-aligned after packing. +4. don't forget to minus 128 by bias_data. +*/ +// No Trans +#define INT8_ADD_128(in, vec_128_s16) \ + { \ + __m256i in_lo = lasx_adds_i16( \ + vec_128_s16, lasx_cvti8_i16(lasx_castm256i_m128i(in))); \ + __m256i in_hi = lasx_adds_i16( \ + vec_128_s16, lasx_cvti8_i16(lasx_extracti128_m256i(in, 1))); \ + in_lo = lasx_packus_i16(in_lo, in_hi); \ + in = lasx_permute4x64_i64(in_lo, 216); \ + } + +#define TRANSPOSE_4x32 \ + vec_l01 = lasx_unpacklo_i8(vec_line0, vec_line1); \ + vec_l23 = lasx_unpacklo_i8(vec_line2, vec_line3); \ + vec_h01 = lasx_unpackhi_i8(vec_line0, vec_line1); \ + vec_h23 = lasx_unpackhi_i8(vec_line2, vec_line3); \ + vec_l03 = lasx_unpacklo_i16(vec_l01, vec_l23); \ + vec_h03 = lasx_unpackhi_i16(vec_l01, vec_l23); \ + vec_l03_1 = lasx_unpacklo_i16(vec_h01, vec_h23); \ + vec_h03_1 = lasx_unpackhi_i16(vec_h01, vec_h23); \ + vec_out0 = lasx_permute2x128_m256i(vec_l03, vec_h03, 0x20); \ + INT8_ADD_128(vec_out0, vec_128_s16) \ + vec_out1 = lasx_permute2x128_m256i(vec_l03_1, vec_h03_1, 0x20); \ + INT8_ADD_128(vec_out1, vec_128_s16) \ + vec_out2 = lasx_permute2x128_m256i(vec_l03, vec_h03, 0x31); \ + INT8_ADD_128(vec_out2, vec_128_s16) \ + vec_out3 = lasx_permute2x128_m256i(vec_l03_1, vec_h03_1, 0x31); \ + INT8_ADD_128(vec_out3, vec_128_s16) + +#define STORE_4x32 \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr), vec_out0); \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr + 32), vec_out1); \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr + 64), vec_out2); \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr + 96), vec_out3); \ + out_ptr += 32 * 4; + +#define STORE_4x24 \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr), vec_out0); \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr + 32), vec_out1); \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr + 64), vec_out2); \ + out_ptr += 24 * 4; + +#define STORE_4x16 \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr), vec_out0); \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr + 32), vec_out1); \ + out_ptr += 16 * 4; + +#define STORE_4x8 \ + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr), vec_out0); \ + out_ptr += 8 * 4; + +#define STORE_4x4 \ + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr), \ + lasx_castm256i_m128i(vec_out0)); \ + out_ptr += 4 * 4; + +#define STORE_4x2 \ + { \ + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr), \ + lsx_castm128i_f32(lasx_castm256i_m128i(vec_out0))); \ + out_ptr += 2 * 4; \ + } + +#define STORE_4x1 \ + { \ + lsx_store_1f32(reinterpret_cast(out_ptr), \ + 
lsx_castm128i_f32(lasx_castm256i_m128i(vec_out0))); \ + out_ptr += 4; \ + } + +#define LOAD_32 \ + vec_line0 = lasx_loadu_m256i( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n)); \ + vec_line1 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + (loop_k + 1) * stride + loop_n)); \ + vec_line2 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + (loop_k + 2) * stride + loop_n)); \ + vec_line3 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + (loop_k + 3) * stride + loop_n)); + +#define LOAD_EPI32(num) \ + vec_line0 = lasx_maskload_i32( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n), \ + vec_mask_##num); \ + vec_line1 = lasx_maskload_i32( \ + reinterpret_cast(b_ptr + (loop_k + 1) * stride + loop_n), \ + vec_mask_##num); \ + vec_line2 = lasx_maskload_i32( \ + reinterpret_cast(b_ptr + (loop_k + 2) * stride + loop_n), \ + vec_mask_##num); \ + vec_line3 = lasx_maskload_i32( \ + reinterpret_cast(b_ptr + (loop_k + 3) * stride + loop_n), \ + vec_mask_##num); + +#define LOAD_EPI64(num) \ + vec_line0 = lasx_maskload_i64( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n), \ + vec_mask_##num); \ + vec_line1 = \ + lasx_maskload_i64(reinterpret_cast( \ + b_ptr + (loop_k + 1) * stride + loop_n), \ + vec_mask_##num); \ + vec_line2 = \ + lasx_maskload_i64(reinterpret_cast( \ + b_ptr + (loop_k + 2) * stride + loop_n), \ + vec_mask_##num); \ + vec_line3 = \ + lasx_maskload_i64(reinterpret_cast( \ + b_ptr + (loop_k + 3) * stride + loop_n), \ + vec_mask_##num); + +#define LOAD_REMAIN(remain) \ + switch (remain) { \ + case 1: \ + vec_line0 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + loop_k * stride + loop_n)); \ + vec_line1 = lasx_setzero_m256i(); \ + vec_line2 = lasx_setzero_m256i(); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + case 2: \ + vec_line0 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + loop_k * stride + loop_n)); \ + vec_line1 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + (loop_k + 1) * stride + loop_n)); \ + vec_line2 = lasx_setzero_m256i(); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + case 3: \ + vec_line0 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + loop_k * stride + loop_n)); \ + vec_line1 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + (loop_k + 1) * stride + loop_n)); \ + vec_line2 = lasx_loadu_m256i(reinterpret_cast( \ + b_ptr + (loop_k + 2) * stride + loop_n)); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + case 0: \ + vec_line0 = lasx_setzero_m256i(); \ + vec_line1 = lasx_setzero_m256i(); \ + vec_line2 = lasx_setzero_m256i(); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + default: \ + break; \ + } + +#define LOAD_REMAIN_EPI64(remain, num) \ + switch (remain) { \ + case 1: \ + vec_line0 = lasx_maskload_i64( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n), \ + vec_mask_##num); \ + vec_line1 = lasx_setzero_m256i(); \ + vec_line2 = lasx_setzero_m256i(); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + case 2: \ + vec_line0 = lasx_maskload_i64( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n), \ + vec_mask_##num); \ + vec_line1 = \ + lasx_maskload_i64(reinterpret_cast( \ + b_ptr + (loop_k + 1) * stride + loop_n), \ + vec_mask_##num); \ + vec_line2 = lasx_setzero_m256i(); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + case 3: \ + vec_line0 = lasx_maskload_i64( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n), \ + vec_mask_##num); \ + vec_line1 = \ + lasx_maskload_i64(reinterpret_cast( \ + b_ptr + (loop_k + 1) * stride + loop_n), \ + vec_mask_##num); \ + vec_line2 = \ + lasx_maskload_i64(reinterpret_cast( \ + b_ptr + 
(loop_k + 2) * stride + loop_n), \ + vec_mask_##num); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + default: \ + break; \ + } + +#define LOAD_REMAIN_EPI32(remain, num) \ + switch (remain) { \ + case 1: \ + vec_line0 = lasx_maskload_i32( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n), \ + vec_mask_##num); \ + vec_line1 = lasx_setzero_m256i(); \ + vec_line2 = lasx_setzero_m256i(); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + case 2: \ + vec_line0 = lasx_maskload_i32( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n), \ + vec_mask_##num); \ + vec_line1 = \ + lasx_maskload_i32(reinterpret_cast( \ + b_ptr + (loop_k + 1) * stride + loop_n), \ + vec_mask_##num); \ + vec_line2 = lasx_setzero_m256i(); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + case 3: \ + vec_line0 = lasx_maskload_i32( \ + reinterpret_cast(b_ptr + loop_k * stride + loop_n), \ + vec_mask_##num); \ + vec_line1 = \ + lasx_maskload_i32(reinterpret_cast( \ + b_ptr + (loop_k + 1) * stride + loop_n), \ + vec_mask_##num); \ + vec_line2 = \ + lasx_maskload_i32(reinterpret_cast( \ + b_ptr + (loop_k + 2) * stride + loop_n), \ + vec_mask_##num); \ + vec_line3 = lasx_setzero_m256i(); \ + break; \ + default: \ + break; \ + } + +void packB_i82u8_notrans( + int N, int K, int stride, const int8_t *B, uint8_t *pack_B) { + int loop_n = 0; + int loop_k = 0; + int remain_k = 0; + int k_align4 = 0; + int8_t *b_ptr = const_cast(B); + uint8_t *out_ptr = pack_B; + + __m256i vec_line0, vec_line1, vec_line2, vec_line3; + __m256i vec_l01, vec_l23, vec_h01, vec_h23; + __m256i vec_l03, vec_h03, vec_l03_1, vec_h03_1; + __m256i vec_out0, vec_out1, vec_out2, vec_out3; + __m256i vec_128_s16 = + lasx_set1_i16(static_cast(TRANS_INT8_UINT8_OFFT)); + + // mask load, store + __m256i vec_mask_24, vec_mask_16, vec_mask_8, vec_mask_4; + int64_t mask0[4] = {-1, -1, -1, 0}; + int mask1[8] = {-1, 0, 0, 0, 0, 0, 0, 0}; + vec_mask_24 = lasx_loadu_m256i(reinterpret_cast<__m256i const *>(mask0)); + mask0[2] = static_cast(0); + vec_mask_16 = lasx_loadu_m256i(reinterpret_cast<__m256i const *>(mask0)); + mask0[1] = static_cast(0); + vec_mask_8 = lasx_loadu_m256i(reinterpret_cast<__m256i const *>(mask0)); + vec_mask_4 = lasx_loadu_m256i(reinterpret_cast<__m256i const *>(mask1)); + + int8_t *vec_ptr[4]; + vec_ptr[0] = reinterpret_cast(&vec_line0); + vec_ptr[1] = reinterpret_cast(&vec_line1); + vec_ptr[2] = reinterpret_cast(&vec_line2); + vec_ptr[3] = reinterpret_cast(&vec_line3); + + k_align4 = ((K + 3) / 4); + k_align4 = k_align4 * 4; + + for (loop_n = 0; loop_n + 31 < N; loop_n += 32) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + LOAD_32; + TRANSPOSE_4x32; + STORE_4x32; + } + remain_k = K - loop_k; + if (remain_k > 0) { + LOAD_REMAIN(remain_k); + TRANSPOSE_4x32; + STORE_4x32; + } + } + for (; loop_n + 23 < N; loop_n += 24) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + LOAD_EPI64(24); + TRANSPOSE_4x32; + STORE_4x24; + } + remain_k = K - loop_k; + if (remain_k > 0) { + LOAD_REMAIN_EPI64(remain_k, 24); + TRANSPOSE_4x32; + STORE_4x24; + } + } + for (; loop_n + 15 < N; loop_n += 16) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + LOAD_EPI64(16); + TRANSPOSE_4x32; + STORE_4x16; + } + remain_k = K - loop_k; + if (remain_k > 0) { + LOAD_REMAIN_EPI64(remain_k, 16); + TRANSPOSE_4x32; + STORE_4x16; + } + } + for (; loop_n + 7 < N; loop_n += 8) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + LOAD_EPI64(8); + TRANSPOSE_4x32; + STORE_4x8; + } + remain_k = K - loop_k; + if (remain_k > 0) { + LOAD_REMAIN_EPI64(remain_k, 8); + 
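+      // Rows beyond K were zero-filled by LOAD_REMAIN_EPI64 above, so the
+      // packed B block stays 4-aligned along K.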
TRANSPOSE_4x32; + STORE_4x8; + } + } + for (; loop_n + 3 < N; loop_n += 4) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + LOAD_EPI32(4); + TRANSPOSE_4x32; + STORE_4x4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + LOAD_REMAIN_EPI32(remain_k, 4); + TRANSPOSE_4x32; + STORE_4x4; + } + } + for (; loop_n + 1 < N; loop_n += 2) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + LOAD_REMAIN(0); + vec_ptr[0][0] = *(b_ptr + loop_k * stride + loop_n); + vec_ptr[0][1] = *(b_ptr + loop_k * stride + loop_n + 1); + vec_ptr[1][0] = *(b_ptr + (loop_k + 1) * stride + loop_n); + vec_ptr[1][1] = *(b_ptr + (loop_k + 1) * stride + loop_n + 1); + vec_ptr[2][0] = *(b_ptr + (loop_k + 2) * stride + loop_n); + vec_ptr[2][1] = *(b_ptr + (loop_k + 2) * stride + loop_n + 1); + vec_ptr[3][0] = *(b_ptr + (loop_k + 3) * stride + loop_n); + vec_ptr[3][1] = *(b_ptr + (loop_k + 3) * stride + loop_n + 1); + TRANSPOSE_4x32; + STORE_4x2; + } + remain_k = K - loop_k; + if (remain_k > 0) { + LOAD_REMAIN(0); + for (int i = 0; i < remain_k; i++) { + vec_ptr[i][0] = *(b_ptr + (loop_k + i) * stride + loop_n); + vec_ptr[i][1] = *(b_ptr + (loop_k + i) * stride + loop_n + 1); + } + TRANSPOSE_4x32; + STORE_4x2; + } + } + for (; loop_n < N; loop_n++) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + LOAD_REMAIN(0); + vec_ptr[0][0] = *(b_ptr + loop_k * stride + loop_n); + vec_ptr[1][0] = *(b_ptr + (loop_k + 1) * stride + loop_n); + vec_ptr[2][0] = *(b_ptr + (loop_k + 2) * stride + loop_n); + vec_ptr[3][0] = *(b_ptr + (loop_k + 3) * stride + loop_n); + TRANSPOSE_4x32; + STORE_4x1; + } + remain_k = K - loop_k; + if (remain_k > 0) { + LOAD_REMAIN(0); + for (int i = 0; i < remain_k; i++) { + vec_ptr[i][0] = *(b_ptr + (loop_k + i) * stride + loop_n); + } + TRANSPOSE_4x32; + STORE_4x1; + } + } +} + +// TRANS +// in0: __m128i in1: __m256i +#define INT8_ADD_128_HALF(in, vec_128_s16) \ + { \ + __m256i in_256 = lasx_adds_i16(vec_128_s16, lasx_cvti8_i16(in)); \ + __m128i in_lo = lasx_castm256i_m128i(in_256); \ + __m128i in_hi = lasx_extractf128_m256i(in_256, 1); \ + in = lsx_packus_i16(in_lo, in_hi); \ + } + +#define TRANSPOSE_STORE_4x16(out_offt, stride) \ + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*16), \ + veci_line[0]); \ + lsx_storeu_m128i( \ + reinterpret_cast<__m128i *>(out_ptr + stride + (out_offt)*16), \ + veci_line[1]); \ + lsx_storeu_m128i( \ + reinterpret_cast<__m128i *>(out_ptr + stride * 2 + (out_offt)*16), \ + veci_line[2]); \ + lsx_storeu_m128i( \ + reinterpret_cast<__m128i *>(out_ptr + stride * 3 + (out_offt)*16), \ + veci_line[3]); + +#define TRANSPOSE_STORE_4x8(out_offt, stride) \ + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*16), \ + veci_line[0]); \ + lsx_storeu_m128i( \ + reinterpret_cast<__m128i *>(out_ptr + stride + (out_offt)*16), \ + veci_line[1]); + +#define TRANSPOSE_STORE_2x16(out_offt) \ + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*8), \ + veci_line[0]); \ + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*8 + 8), \ + veci_line[1]); \ + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*8 + 16), \ + veci_line[2]); \ + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*8 + 24), \ + veci_line[3]); + +#define TRANSPOSE_STORE_2x8(out_offt) \ + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*8), \ + veci_line[0]); \ + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*8 + 8), \ + veci_line[1]); + +#define TRANSPOSE_4x16(in_offt, out_offt, stride) \ + vec_line[0] = 
lsx_loadu_f32(reinterpret_cast( \ + b_ptr + step * ((in_offt) + 0) + loop_k)); \ + vec_line[1] = lsx_loadu_f32(reinterpret_cast( \ + b_ptr + step * ((in_offt) + 1) + loop_k)); \ + vec_line[2] = lsx_loadu_f32(reinterpret_cast( \ + b_ptr + step * ((in_offt) + 2) + loop_k)); \ + vec_line[3] = lsx_loadu_f32(reinterpret_cast( \ + b_ptr + step * ((in_offt) + 3) + loop_k)); \ + LSX_TRANSPOSE4_S(vec_line[0], vec_line[1], vec_line[2], vec_line[3]); \ + veci_line[0] = lsx_castf32_m128i(vec_line[0]); \ + veci_line[1] = lsx_castf32_m128i(vec_line[1]); \ + veci_line[2] = lsx_castf32_m128i(vec_line[2]); \ + veci_line[3] = lsx_castf32_m128i(vec_line[3]); \ + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) \ + INT8_ADD_128_HALF(veci_line[1], vec_128_s16) \ + INT8_ADD_128_HALF(veci_line[2], vec_128_s16) \ + INT8_ADD_128_HALF(veci_line[3], vec_128_s16) \ + TRANSPOSE_STORE_4x16(out_offt, stride) + +#define TRANSPOSE_4x8(in_offt, out_offt, stride) \ + vec_line[0] = lsx_loadl_pi(vecf_0, \ + reinterpret_cast<__m64 const *>( \ + b_ptr + step * ((in_offt) + 0) + loop_k)); \ + vec_line[1] = lsx_loadl_pi(vecf_0, \ + reinterpret_cast<__m64 const *>( \ + b_ptr + step * ((in_offt) + 1) + loop_k)); \ + vec_line[2] = lsx_loadl_pi(vecf_0, \ + reinterpret_cast<__m64 const *>( \ + b_ptr + step * ((in_offt) + 2) + loop_k)); \ + vec_line[3] = lsx_loadl_pi(vecf_0, \ + reinterpret_cast<__m64 const *>( \ + b_ptr + step * ((in_offt) + 3) + loop_k)); \ + LSX_TRANSPOSE4_S(vec_line[0], vec_line[1], vec_line[2], vec_line[3]); \ + veci_line[0] = lsx_castf32_m128i(vec_line[0]); \ + veci_line[1] = lsx_castf32_m128i(vec_line[1]); \ + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) \ + INT8_ADD_128_HALF(veci_line[1], vec_128_s16) \ + TRANSPOSE_STORE_4x8(out_offt, stride) + +#define TRANSPOSE_4x4(in_offt, out_offt) \ + vec_line[0] = lsx_castm128i_f32(lsx_set1_i32( \ + *(reinterpret_cast(b_ptr + step * ((in_offt) + 0) + loop_k)))); \ + vec_line[1] = lsx_castm128i_f32(lsx_set1_i32( \ + *(reinterpret_cast(b_ptr + step * ((in_offt) + 1) + loop_k)))); \ + vec_line[2] = lsx_castm128i_f32(lsx_set1_i32( \ + *(reinterpret_cast(b_ptr + step * ((in_offt) + 2) + loop_k)))); \ + vec_line[3] = lsx_castm128i_f32(lsx_set1_i32( \ + *(reinterpret_cast(b_ptr + step * ((in_offt) + 3) + loop_k)))); \ + LSX_TRANSPOSE4_S(vec_line[0], vec_line[1], vec_line[2], vec_line[3]); \ + veci_line[0] = lsx_castf32_m128i(vec_line[0]); \ + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) \ + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*16), \ + veci_line[0]); + +#define TRANSPOSE_4xX(num, in_offt, out_offt) \ + { \ + vec_line[0] = lsx_set1_f32(0.f); \ + vec_line[1] = lsx_set1_f32(0.f); \ + vec_line[2] = lsx_set1_f32(0.f); \ + vec_line[3] = lsx_set1_f32(0.f); \ + int8_t *tmp0 = reinterpret_cast(&vec_line[0]); \ + int8_t *tmp1 = reinterpret_cast(&vec_line[1]); \ + int8_t *tmp2 = reinterpret_cast(&vec_line[2]); \ + int8_t *tmp3 = reinterpret_cast(&vec_line[3]); \ + for (int i = 0; i < num; i++) { \ + tmp0[i] = *(b_ptr + step * ((in_offt) + 0) + loop_k + i); \ + tmp1[i] = *(b_ptr + step * ((in_offt) + 1) + loop_k + i); \ + tmp2[i] = *(b_ptr + step * ((in_offt) + 2) + loop_k + i); \ + tmp3[i] = *(b_ptr + step * ((in_offt) + 3) + loop_k + i); \ + } \ + LSX_TRANSPOSE4_S(vec_line[0], vec_line[1], vec_line[2], vec_line[3]); \ + veci_line[0] = lsx_castf32_m128i(vec_line[0]); \ + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) \ + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*16), \ + veci_line[0]); \ + } + +#define TRANSPOSE_2x16(in_offt, out_offt) \ + 
vec_line[0] = lsx_loadu_f32(reinterpret_cast( \ + b_ptr + step * ((in_offt) + 0) + loop_k)); \ + vec_line[1] = lsx_loadu_f32(reinterpret_cast( \ + b_ptr + step * ((in_offt) + 1) + loop_k)); \ + LSX_TRANSPOSE4_S(vec_line[0], vec_line[1], vec_line[2], vec_line[3]); \ + veci_line[0] = lsx_castf32_m128i(vec_line[0]); \ + veci_line[1] = lsx_castf32_m128i(vec_line[1]); \ + veci_line[2] = lsx_castf32_m128i(vec_line[2]); \ + veci_line[3] = lsx_castf32_m128i(vec_line[3]); \ + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) \ + INT8_ADD_128_HALF(veci_line[1], vec_128_s16) \ + INT8_ADD_128_HALF(veci_line[2], vec_128_s16) \ + INT8_ADD_128_HALF(veci_line[3], vec_128_s16) \ + TRANSPOSE_STORE_2x16(out_offt) + +#define TRANSPOSE_2x8(in_offt, out_offt) \ + vec_line[0] = lsx_loadl_pi(vecf_0, \ + reinterpret_cast<__m64 const *>( \ + b_ptr + step * ((in_offt) + 0) + loop_k)); \ + vec_line[1] = lsx_loadl_pi(vecf_0, \ + reinterpret_cast<__m64 const *>( \ + b_ptr + step * ((in_offt) + 1) + loop_k)); \ + LSX_TRANSPOSE4_S(vec_line[0], vec_line[1], vec_line[2], vec_line[3]); \ + veci_line[0] = lsx_castf32_m128i(vec_line[0]); \ + veci_line[1] = lsx_castf32_m128i(vec_line[1]); \ + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) \ + INT8_ADD_128_HALF(veci_line[1], vec_128_s16) \ + TRANSPOSE_STORE_2x8(out_offt) + +#define TRANSPOSE_2x4(in_offt, out_offt) \ + vec_line[0] = lsx_castm128i_f32(lsx_set1_i32( \ + *(reinterpret_cast(b_ptr + step * ((in_offt) + 0) + loop_k)))); \ + vec_line[1] = lsx_castm128i_f32(lsx_set1_i32( \ + *(reinterpret_cast(b_ptr + step * ((in_offt) + 1) + loop_k)))); \ + LSX_TRANSPOSE4_S(vec_line[0], vec_line[1], vec_line[2], vec_line[3]); \ + veci_line[0] = lsx_castf32_m128i(vec_line[0]); \ + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) \ + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*8), \ + veci_line[0]); + +#define TRANSPOSE_2xX(num, in_offt, out_offt) \ + { \ + vec_line[0] = lsx_set1_f32(0.f); \ + vec_line[1] = lsx_set1_f32(0.f); \ + int8_t *tmp0 = reinterpret_cast(&vec_line[0]); \ + int8_t *tmp1 = reinterpret_cast(&vec_line[1]); \ + for (int i = 0; i < num; i++) { \ + tmp0[i] = *(b_ptr + step * ((in_offt) + 0) + loop_k + i); \ + tmp1[i] = *(b_ptr + step * ((in_offt) + 1) + loop_k + i); \ + } \ + LSX_TRANSPOSE4_S(vec_line[0], vec_line[1], vec_line[2], vec_line[3]); \ + veci_line[0] = lsx_castf32_m128i(vec_line[0]); \ + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) \ + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr + (out_offt)*8), \ + veci_line[0]); \ + } + +void packB_i82u8_trans( + int N, int K, int step, const int8_t *B, uint8_t *pack_B) { + int loop_n = 0, loop_k = 0; + int remain_k = 0; + int8_t *b_ptr = const_cast(B); + uint8_t *out_ptr = pack_B; + int k_align4 = ((K + 3) / 4); + k_align4 = k_align4 * 4; + + __m128 vec_line[4] = {0}; + __m128i veci_line[4] = {0}; + __m128 vecf_0 = lsx_set1_f32(0.f); + __m256i vec_128_s16 = + lasx_set1_i16(static_cast(TRANS_INT8_UINT8_OFFT)); + + for (loop_n = 0; loop_n + 31 < N; loop_n += 32) { + for (loop_k = 0; loop_k + 15 < K; loop_k += 16) { + TRANSPOSE_4x16(loop_n, 0, 128); + TRANSPOSE_4x16((loop_n + 4), 1, 128); + TRANSPOSE_4x16((loop_n + 8), 2, 128); + TRANSPOSE_4x16((loop_n + 12), 3, 128); + TRANSPOSE_4x16((loop_n + 16), 4, 128); + TRANSPOSE_4x16((loop_n + 20), 5, 128); + TRANSPOSE_4x16((loop_n + 24), 6, 128); + TRANSPOSE_4x16((loop_n + 28), 7, 128); + out_ptr += 32 * 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + TRANSPOSE_4x8(loop_n, 0, 128); + TRANSPOSE_4x8((loop_n + 4), 1, 128); + TRANSPOSE_4x8((loop_n + 8), 2, 128); + 
TRANSPOSE_4x8((loop_n + 12), 3, 128); + TRANSPOSE_4x8((loop_n + 16), 4, 128); + TRANSPOSE_4x8((loop_n + 20), 5, 128); + TRANSPOSE_4x8((loop_n + 24), 6, 128); + TRANSPOSE_4x8((loop_n + 28), 7, 128); + out_ptr += 32 * 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + TRANSPOSE_4x4(loop_n, 0); + TRANSPOSE_4x4((loop_n + 4), 1); + TRANSPOSE_4x4((loop_n + 8), 2); + TRANSPOSE_4x4((loop_n + 12), 3); + TRANSPOSE_4x4((loop_n + 16), 4); + TRANSPOSE_4x4((loop_n + 20), 5); + TRANSPOSE_4x4((loop_n + 24), 6); + TRANSPOSE_4x4((loop_n + 28), 7); + out_ptr += 32 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + TRANSPOSE_4xX(remain_k, loop_n, 0); + TRANSPOSE_4xX(remain_k, (loop_n + 4), 1); + TRANSPOSE_4xX(remain_k, (loop_n + 8), 2); + TRANSPOSE_4xX(remain_k, (loop_n + 12), 3); + TRANSPOSE_4xX(remain_k, (loop_n + 16), 4); + TRANSPOSE_4xX(remain_k, (loop_n + 20), 5); + TRANSPOSE_4xX(remain_k, (loop_n + 24), 6); + TRANSPOSE_4xX(remain_k, (loop_n + 28), 7); + out_ptr += 32 * 4; + } + } + for (; loop_n + 23 < N; loop_n += 24) { + for (loop_k = 0; loop_k + 15 < K; loop_k += 16) { + TRANSPOSE_4x16(loop_n, 0, 96); + TRANSPOSE_4x16((loop_n + 4), 1, 96); + TRANSPOSE_4x16((loop_n + 8), 2, 96); + TRANSPOSE_4x16((loop_n + 12), 3, 96); + TRANSPOSE_4x16((loop_n + 16), 4, 96); + TRANSPOSE_4x16((loop_n + 20), 5, 96); + out_ptr += 24 * 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + TRANSPOSE_4x8(loop_n, 0, 96); + TRANSPOSE_4x8((loop_n + 4), 1, 96); + TRANSPOSE_4x8((loop_n + 8), 2, 96); + TRANSPOSE_4x8((loop_n + 12), 3, 96); + TRANSPOSE_4x8((loop_n + 16), 4, 96); + TRANSPOSE_4x8((loop_n + 20), 5, 96); + out_ptr += 24 * 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + TRANSPOSE_4x4(loop_n, 0); + TRANSPOSE_4x4((loop_n + 4), 1); + TRANSPOSE_4x4((loop_n + 8), 2); + TRANSPOSE_4x4((loop_n + 12), 3); + TRANSPOSE_4x4((loop_n + 16), 4); + TRANSPOSE_4x4((loop_n + 20), 5); + out_ptr += 24 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + TRANSPOSE_4xX(remain_k, loop_n, 0); + TRANSPOSE_4xX(remain_k, (loop_n + 4), 1); + TRANSPOSE_4xX(remain_k, (loop_n + 8), 2); + TRANSPOSE_4xX(remain_k, (loop_n + 12), 3); + TRANSPOSE_4xX(remain_k, (loop_n + 16), 4); + TRANSPOSE_4xX(remain_k, (loop_n + 20), 5); + out_ptr += 24 * 4; + } + } + for (; loop_n + 15 < N; loop_n += 16) { + for (loop_k = 0; loop_k + 15 < K; loop_k += 16) { + TRANSPOSE_4x16(loop_n, 0, 64); + TRANSPOSE_4x16((loop_n + 4), 1, 64); + TRANSPOSE_4x16((loop_n + 8), 2, 64); + TRANSPOSE_4x16((loop_n + 12), 3, 64); + out_ptr += 16 * 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + TRANSPOSE_4x8(loop_n, 0, 64); + TRANSPOSE_4x8((loop_n + 4), 1, 64); + TRANSPOSE_4x8((loop_n + 8), 2, 64); + TRANSPOSE_4x8((loop_n + 12), 3, 64); + out_ptr += 16 * 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + TRANSPOSE_4x4(loop_n, 0); + TRANSPOSE_4x4((loop_n + 4), 1); + TRANSPOSE_4x4((loop_n + 8), 2); + TRANSPOSE_4x4((loop_n + 12), 3); + out_ptr += 16 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + TRANSPOSE_4xX(remain_k, loop_n, 0); + TRANSPOSE_4xX(remain_k, (loop_n + 4), 1); + TRANSPOSE_4xX(remain_k, (loop_n + 8), 2); + TRANSPOSE_4xX(remain_k, (loop_n + 12), 3); + out_ptr += 16 * 4; + } + } + for (; loop_n + 7 < N; loop_n += 8) { + for (loop_k = 0; loop_k + 15 < K; loop_k += 16) { + TRANSPOSE_4x16(loop_n, 0, 32); + TRANSPOSE_4x16((loop_n + 4), 1, 32); + out_ptr += 8 * 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + TRANSPOSE_4x8(loop_n, 0, 32); + TRANSPOSE_4x8((loop_n + 4), 1, 32); + out_ptr += 8 * 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + TRANSPOSE_4x4(loop_n, 0); + 
TRANSPOSE_4x4((loop_n + 4), 1); + out_ptr += 8 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + TRANSPOSE_4xX(remain_k, loop_n, 0); + TRANSPOSE_4xX(remain_k, (loop_n + 4), 1); + out_ptr += 8 * 4; + } + } + for (; loop_n + 3 < N; loop_n += 4) { + for (loop_k = 0; loop_k + 15 < K; loop_k += 16) { + TRANSPOSE_4x16(loop_n, 0, 16); + out_ptr += 4 * 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + TRANSPOSE_4x8(loop_n, 0, 16); + out_ptr += 4 * 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + TRANSPOSE_4x4(loop_n, 0); + out_ptr += 4 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + TRANSPOSE_4xX(remain_k, loop_n, 0); + out_ptr += 4 * 4; + } + } + for (; loop_n + 1 < N; loop_n += 2) { + for (loop_k = 0; loop_k + 15 < K; loop_k += 16) { + TRANSPOSE_2x16(loop_n, 0); + out_ptr += 2 * 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + TRANSPOSE_2x8(loop_n, 0); + out_ptr += 2 * 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + TRANSPOSE_2x4(loop_n, 0); + out_ptr += 2 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + TRANSPOSE_2xX(remain_k, loop_n, 0); + out_ptr += 2 * 4; + } + } + for (; loop_n < N; loop_n++) { + for (loop_k = 0; loop_k + 15 < K; loop_k += 16) { + veci_line[0] = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(b_ptr + step * loop_n + loop_k)); + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr), veci_line[0]); + out_ptr += 1 * 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + veci_line[0] = lsx_set1_i64x( + *(reinterpret_cast<__int64 *>(b_ptr + step * loop_n + loop_k))); + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) + lsx_storel_i64(reinterpret_cast<__m128i *>(out_ptr), veci_line[0]); + out_ptr += 1 * 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + veci_line[0] = lsx_set1_i32( + *(reinterpret_cast(b_ptr + step * loop_n + loop_k))); + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) + lsx_store_1f32(reinterpret_cast(out_ptr), + lsx_castm128i_f32(veci_line[0])); + out_ptr += 1 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + veci_line[0] = lsx_set1_i32(0); + int8_t *vec_tmp = reinterpret_cast(&veci_line[0]); + for (int i = 0; i < remain_k; i++) { + vec_tmp[i] = *(b_ptr + step * loop_n + loop_k + i); + } + INT8_ADD_128_HALF(veci_line[0], vec_128_s16) + lsx_store_1f32(reinterpret_cast(out_ptr), + lsx_castm128i_f32(veci_line[0])); + out_ptr += 1 * 4; + } + } +} + + +void gemm_s8u8s8_runpackB( + int N, int K, int stride, const int8_t *B, uint8_t *pack_B, bool is_trans) { + if (is_trans) { + packB_i82u8_trans(N, K, stride, B, pack_B); + } else { + packB_i82u8_notrans(N, K, stride, B, pack_B); + } +} + +#endif // __loongarch_asx + +// if K is not 4-aligned, need to pad zero +void packA_i8_notrans(int M, int K, const int8_t *AA, int8_t *pack_A) { + int8_t *out_ptr = pack_A; + int loop_m = 0; + int loop_k = 0; + int remain_k = 0; + int8_t *A = const_cast(AA); + +#ifdef __loongarch_asx + __m256i vec_line0, vec_line1, vec_lo, vec_hi; +#endif + __m128i vec_line0_h, vec_line1_h, vec_lo_h, vec_hi_h; + + for (loop_m = 0; loop_m + 1 < M; loop_m += 2) { + loop_k = 0; +#ifdef __loongarch_asx + for (; loop_k + 31 < K; loop_k += 32) { + vec_line0 = lasx_loadu_m256i( + reinterpret_cast<__m256i const *>(A + loop_m * K + loop_k)); + vec_line1 = lasx_loadu_m256i( + reinterpret_cast<__m256i const *>(A + (loop_m + 1) * K + loop_k)); + vec_lo = lasx_unpacklo_i32(vec_line0, vec_line1); + vec_hi = lasx_unpackhi_i32(vec_line0, vec_line1); + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr), + lasx_permute2x128_m256i(vec_lo, 
vec_hi, 0x20)); + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr + 32), + lasx_permute2x128_m256i(vec_lo, vec_hi, 0x31)); + out_ptr += 2 * 32; + } +#endif + for (; loop_k + 15 < K; loop_k += 16) { + vec_line0_h = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(A + loop_m * K + loop_k)); + vec_line1_h = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(A + (loop_m + 1) * K + loop_k)); + vec_lo_h = lsx_unpacklo_i32(vec_line0_h, vec_line1_h); + vec_hi_h = lsx_unpackhi_i32(vec_line0_h, vec_line1_h); + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr), vec_lo_h); + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr + 16), vec_hi_h); + out_ptr += 2 * 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + vec_line0_h = lsx_loadl_i64( + reinterpret_cast<__m128i const *>(A + loop_m * K + loop_k)); + vec_line1_h = lsx_loadl_i64( + reinterpret_cast<__m128i const *>(A + (loop_m + 1) * K + loop_k)); + vec_lo_h = lsx_unpacklo_i32(vec_line0_h, vec_line1_h); + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr), vec_lo_h); + out_ptr += 2 * 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + vec_line0_h = + lsx_set1_i32(*(reinterpret_cast(A + loop_m * K + loop_k))); + vec_line1_h = lsx_set1_i32( + *(reinterpret_cast(A + (loop_m + 1) * K + loop_k))); + vec_lo_h = lsx_unpacklo_i32(vec_line0_h, vec_line1_h); + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr), + lsx_castm128i_f32(vec_lo_h)); + out_ptr += 2 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + vec_line0_h = lsx_setzero_m128i(); + vec_line1_h = lsx_setzero_m128i(); + for (int i = 0; i < remain_k; i++) { + int8_t *tmp = reinterpret_cast(&vec_line0_h); + tmp[i] = *(A + loop_m * K + loop_k + i); + tmp = reinterpret_cast(&vec_line1_h); + tmp[i] = *(A + (loop_m + 1) * K + loop_k + i); + } + vec_lo_h = lsx_unpacklo_i32(vec_line0_h, vec_line1_h); + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr), + lsx_castm128i_f32(vec_lo_h)); + out_ptr += 2 * 4; + } + } + for (; loop_m < M; loop_m++) { + loop_k = 0; +#ifdef __loongarch_asx + for (; loop_k + 31 < K; loop_k += 32) { + vec_line0 = lasx_loadu_m256i( + reinterpret_cast<__m256i const *>(A + loop_m * K + loop_k)); + lasx_storeu_m256i(reinterpret_cast<__m256i *>(out_ptr), vec_line0); + out_ptr += 32; + } +#endif + for (; loop_k + 15 < K; loop_k += 16) { + vec_line0_h = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(A + loop_m * K + loop_k)); + lsx_storeu_m128i(reinterpret_cast<__m128i *>(out_ptr), vec_line0_h); + out_ptr += 16; + } + for (; loop_k + 7 < K; loop_k += 8) { + vec_line0_h = lsx_loadl_i64( + reinterpret_cast<__m128i const *>(A + loop_m * K + loop_k)); + lsx_storel_pi(reinterpret_cast<__m64 *>(out_ptr), + lsx_castm128i_f32(vec_line0_h)); + out_ptr += 8; + } + for (; loop_k + 3 < K; loop_k += 4) { + vec_line0_h = + lsx_set1_i32(*(reinterpret_cast(A + loop_m * K + loop_k))); + lsx_store_1f32(reinterpret_cast(out_ptr), + lsx_castm128i_f32(vec_line0_h)); + out_ptr += 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + vec_line0_h = lsx_setzero_m128i(); + for (int i = 0; i < remain_k; i++) { + int8_t *tmp = reinterpret_cast(&vec_line0_h); + tmp[i] = *(A + loop_m * K + loop_k + i); + } + lsx_store_1f32(reinterpret_cast(out_ptr), + lsx_castm128i_f32(vec_line0_h)); + out_ptr += 4; + } + } +} + +#define ZERO_ALL \ + vec_line[0] = lsx_setzero_m128i(); \ + vec_line[1] = lsx_setzero_m128i(); \ + vec_line[2] = lsx_setzero_m128i(); \ + vec_line[3] = lsx_setzero_m128i(); + +void packA_i8_trans(int M, int K, const int8_t *AA, int8_t *pack_A) { + int8_t *out_ptr = pack_A; + int loop_m 
= 0; + int loop_k = 0; + int remain_k = 0; + int K_align = 0; + int8_t *A = const_cast(AA); + + __m128i vec_12, vec_23, vec_out; + __m128i vec_line[4]; + + K_align = (K + 3) / 4; + K_align = K_align * 4; + + for (loop_m = 0; loop_m + 15 < M; loop_m += 16) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + vec_line[0] = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(A + loop_k * M + loop_m)); + vec_line[1] = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(A + (loop_k + 1) * M + loop_m)); + vec_line[2] = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(A + (loop_k + 2) * M + loop_m)); + vec_line[3] = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(A + (loop_k + 3) * M + loop_m)); + TRANSPOSEA_4x16 out_ptr += 2 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + ZERO_ALL + for (int i = 0; i < remain_k; i++) { + vec_line[i] = lsx_loadu_m128i( + reinterpret_cast<__m128i const *>(A + (loop_k + i) * M + loop_m)); + } + TRANSPOSEA_4x16 out_ptr += 2 * 4; + } + out_ptr += 14 * K_align; // total 16 * K_align + } + for (; loop_m + 7 < M; loop_m += 8) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + vec_line[0] = lsx_loadl_i64( + reinterpret_cast<__m128i const *>(A + loop_k * M + loop_m)); + vec_line[1] = lsx_loadl_i64( + reinterpret_cast<__m128i const *>(A + (loop_k + 1) * M + loop_m)); + vec_line[2] = lsx_loadl_i64( + reinterpret_cast<__m128i const *>(A + (loop_k + 2) * M + loop_m)); + vec_line[3] = lsx_loadl_i64( + reinterpret_cast<__m128i const *>(A + (loop_k + 3) * M + loop_m)); + TRANSPOSEA_4x8 out_ptr += 2 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + ZERO_ALL + for (int i = 0; i < remain_k; i++) { + vec_line[i] = lsx_loadl_i64( + reinterpret_cast<__m128i const *>(A + (loop_k + i) * M + loop_m)); + } + TRANSPOSEA_4x8 out_ptr += 2 * 4; + } + out_ptr += 6 * K_align; // total 8 * K_align + } + for (; loop_m + 3 < M; loop_m += 4) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + vec_line[0] = + lsx_set1_i32(*(reinterpret_cast(A + loop_k * M + loop_m))); + vec_line[1] = lsx_set1_i32( + *(reinterpret_cast(A + (loop_k + 1) * M + loop_m))); + vec_line[2] = lsx_set1_i32( + *(reinterpret_cast(A + (loop_k + 2) * M + loop_m))); + vec_line[3] = lsx_set1_i32( + *(reinterpret_cast(A + (loop_k + 3) * M + loop_m))); + TRANSPOSEA_4x4 out_ptr += 2 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + ZERO_ALL + for (int i = 0; i < remain_k; i++) { + vec_line[i] = lsx_set1_i32( + *(reinterpret_cast(A + (loop_k + i) * M + loop_m))); + } + TRANSPOSEA_4x4 out_ptr += 2 * 4; + } + out_ptr += 2 * K_align; // total 4 * K_align + } + for (; loop_m + 1 < M; loop_m += 2) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + vec_line[0] = lsx_set1_i16( + *(reinterpret_cast(A + loop_k * M + loop_m))); + vec_line[1] = lsx_set1_i16( + *(reinterpret_cast(A + (loop_k + 1) * M + loop_m))); + vec_line[2] = lsx_set1_i16( + *(reinterpret_cast(A + (loop_k + 2) * M + loop_m))); + vec_line[3] = lsx_set1_i16( + *(reinterpret_cast(A + (loop_k + 3) * M + loop_m))); + TRANSPOSEA_4x2 out_ptr += 2 * 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + ZERO_ALL + for (int i = 0; i < remain_k; i++) { + vec_line[i] = lsx_set1_i16( + *(reinterpret_cast(A + (loop_k + i) * M + loop_m))); + } + TRANSPOSEA_4x2 out_ptr += 2 * 4; + } + } + for (; loop_m < M; loop_m++) { + for (loop_k = 0; loop_k + 3 < K; loop_k += 4) { + out_ptr[0] = *(A + loop_k * M + loop_m); + out_ptr[1] = *(A + (loop_k + 1) * M + loop_m); + out_ptr[2] = *(A + (loop_k + 2) * M + loop_m); + out_ptr[3] = *(A + (loop_k + 3) * M + 
loop_m); + out_ptr += 4; + } + remain_k = K - loop_k; + if (remain_k > 0) { + ZERO_ALL + for (int i = 0; i < remain_k; i++) { + out_ptr[i] = *(A + (loop_k + i) * M + loop_m); + } + out_ptr += 4; + } + } +} + +// PackA 's K dim need 4-aligned, +// so it needs M * K_4aligned Bytes. +void gemm_s8u8s8_prepackA( + int M, int K, const int8_t *A, int8_t *pack_A, bool is_trans) { + if (is_trans) { + packA_i8_trans(M, K, A, pack_A); + } else { + packA_i8_notrans(M, K, A, pack_A); + } +} + +#undef TRANSPOSE_4x32 +#undef TRANSPOSEA_4x16 +#undef TRANSPOSEA_4x8 +#undef TRANSPOSEA_4x4 +#undef TRANSPOSEA_4x2 +#undef ZERO_ALL +#undef INT8_ADD_128 +#undef STORE_4x32 +#undef STORE_4x24 +#undef STORE_4x16 +#undef STORE_4x8 +#undef STORE_4x4 +#undef STORE_4x2 +#undef STORE_4x1 +#undef LOAD_32 +#undef LOAD_EPI32 +#undef LOAD_EPI64 +#undef LOAD_REMAIN +#undef LOAD_REMAIN_EPI64 +#undef LOAD_REMAIN_EPI32 +#undef INT8_ADD_128_HALF +#undef TRANSPOSE_STORE_4x16 +#undef TRANSPOSE_STORE_4x8 +#undef TRANSPOSE_STORE_2x16 +#undef TRANSPOSE_STORE_2x8 +#undef TRANSPOSE_4x16 +#undef TRANSPOSE_4x8 +#undef TRANSPOSE_4x4 +#undef TRANSPOSE_4xX +#undef TRANSPOSE_2x16 +#undef TRANSPOSE_2x8 +#undef TRANSPOSE_2x4 +#undef TRANSPOSE_2xX + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle + diff --git a/lite/backends/loongarch/math/gemm_s8u8_pack.h b/lite/backends/loongarch/math/gemm_s8u8_pack.h new file mode 100644 index 00000000000..8f423d28fcc --- /dev/null +++ b/lite/backends/loongarch/math/gemm_s8u8_pack.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +#define TRANS_INT8_UINT8_OFFT (128) + +// PackA 's K dim need 4-aligned, +// so it needs M * K_4aligned Bytes. +void gemm_s8u8s8_prepackA( + int M, int K, const int8_t* A, int8_t* pack_A, bool is_trans); + +void gemm_s8u8s8_runpackB( + int N, int K, int stride, const int8_t* B, uint8_t* pack_B, bool is_trans); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/group_norm.cc b/lite/backends/loongarch/math/group_norm.cc new file mode 100644 index 00000000000..134f3b1d8ae --- /dev/null +++ b/lite/backends/loongarch/math/group_norm.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
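The packing entry points declared in gemm_s8u8_pack.h above are what an s8u8s8 GEMM driver would call before the compute kernel: A is repacked as int8 with K padded up to a multiple of 4 (hence the documented M * K_4aligned byte requirement), and B is re-biased to uint8 by adding TRANS_INT8_UINT8_OFFT (128) during packing. The sketch below only illustrates that calling convention; the pack_B sizing and the meaning of the stride argument are assumptions, not taken from this patch.

#include <cstdint>
#include <vector>
#include "lite/backends/loongarch/math/gemm_s8u8_pack.h"

namespace lamath = paddle::lite::loongarch::math;

// Hypothetical caller-side sketch (not part of the patch).
void pack_operands_sketch(int M, int N, int K,
                          const int8_t* A, const int8_t* B,
                          bool trans_a, bool trans_b) {
  const int k4 = ((K + 3) / 4) * 4;  // both packers emit 4-byte k-panels
  std::vector<int8_t> pack_A(static_cast<size_t>(M) * k4);   // documented: M * K_4aligned bytes
  std::vector<uint8_t> pack_B(static_cast<size_t>(N) * k4);  // assumed sizing
  const int ldb = trans_b ? K : N;  // assumed: leading dimension of B as stored
  lamath::gemm_s8u8s8_prepackA(M, K, A, pack_A.data(), trans_a);
  lamath::gemm_s8u8s8_runpackB(N, K, ldb, B, pack_B.data(), trans_b);
  // pack_B now holds B + 128 as uint8_t; the GEMM kernel has to compensate
  // for that bias when producing signed int8 output.
}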
+ +#include "lite/backends/loongarch/math/include/group_norm.h" +#include "lite/backends/loongarch/xxl.h" +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void group_norm(const float* in, + float* out, + const int n, + const int c, + const int height, + const int width, + const float epsilon, + const int groups, + const float* scale, + const float* bias, + float* saved_mean, + float* saved_variance) { + int nb = n; + int spatial_size = height * width; + int group_size = (c - 1) / groups + + 1; // equal to instance_norm if the groups value equals to c + +// compute saved_mean and saved_variance +#pragma omp parallel for + for (int i = 0; i < nb; i++) { + for (int gid = 0; gid < groups; gid++) { + float sum_spatial = 0.f; + float summ_spatial = 0.f; + const float* in_data = + in + (i * groups + gid) * group_size * spatial_size; // input offset + float* out_data = + out + + (i * groups + gid) * group_size * spatial_size; // output offset + int number = (group_size > (c - gid * group_size)) + ? (c - gid * group_size) + : group_size; + + // for each group + for (int nid = 0; nid < number; nid++) { + const float* in_p = in_data + nid * spatial_size; + // for each image size w x h + for (int h = 0; h < height; ++h) { + int w = width; + + __m128 sum0 = lsx_set1_f32(0.f); + __m128 sum1 = lsx_set1_f32(0.f); + __m128 sum2 = lsx_set1_f32(0.f); + __m128 sum3 = lsx_set1_f32(0.f); + __m128 square_sum0 = lsx_set1_f32(0.f); + __m128 square_sum1 = lsx_set1_f32(0.f); + __m128 square_sum2 = lsx_set1_f32(0.f); + __m128 square_sum3 = lsx_set1_f32(0.f); + __m128 in0, in1, in2, in3; + for (; w > 15; w -= 16) { + in0 = lsx_loadu_f32(in_p); + in1 = lsx_loadu_f32(in_p + 4); + in2 = lsx_loadu_f32(in_p + 8); + in3 = lsx_loadu_f32(in_p + 12); + // add x + sum0 = lsx_add_f32(sum0, in0); + sum1 = lsx_add_f32(sum1, in1); + sum2 = lsx_add_f32(sum2, in2); + sum3 = lsx_add_f32(sum3, in3); + // add x * x + square_sum0 = lsx_fmadd_f32(in0, in0, square_sum0); + square_sum1 = lsx_fmadd_f32(in1, in1, square_sum1); + square_sum2 = lsx_fmadd_f32(in2, in2, square_sum2); + square_sum3 = lsx_fmadd_f32(in3, in3, square_sum3); + + in_p += 16; + } + for (; w > 7; w -= 8) { + in0 = lsx_loadu_f32(in_p); + in1 = lsx_loadu_f32(in_p + 4); + sum0 = lsx_add_f32(sum0, in0); + sum1 = lsx_add_f32(sum1, in1); + square_sum0 = lsx_fmadd_f32(in0, in0, square_sum0); + square_sum1 = lsx_fmadd_f32(in1, in1, square_sum1); + in_p += 8; + } + for (; w > 3; w -= 4) { + in0 = lsx_loadu_f32(in_p); + sum0 = lsx_add_f32(sum0, in0); + square_sum0 = lsx_fmadd_f32(in0, in0, square_sum0); + in_p += 4; + } + float sum = 0.f; + float summ = 0.f; + for (; w > 0; w--) { + sum += *in_p; + summ += (*in_p) * (*in_p); + in_p++; + } + + sum0 = lsx_add_f32(sum0, sum1); + sum2 = lsx_add_f32(sum2, sum3); + square_sum0 = lsx_add_f32(square_sum0, square_sum1); + square_sum2 = lsx_add_f32(square_sum2, square_sum3); + + sum0 = lsx_add_f32(sum0, sum2); + square_sum0 = lsx_add_f32(square_sum0, square_sum2); + + __m128 r = lsx_hadd_f32(sum0, square_sum0); + r = lsx_hadd_f32(r, r); + float buf[4]; + lsx_storeu_f32(buf, r); + sum += buf[0]; + summ += buf[1]; + // accumulation + sum_spatial += sum; + summ_spatial += summ; + } + } + + float mean = sum_spatial / (number * spatial_size); + // float x_var = summ_spatial / (number * spatial_size); + // float variance = summ_spatial / (number * spatial_size) - mean * mean; + // the flolowing code has higher precision than above comment code + float variance = (summ_spatial - mean * mean * 
spatial_size * number) / + (number * spatial_size); + float std = 1.f / sqrtf(variance + epsilon); + + saved_mean[i * groups + gid] = mean; + saved_variance[i * groups + gid] = variance; + + // compute each group_norm result: out = scale * (in - mean) / std + bias + for (int nid = 0; nid < number; nid++) { + const float* in_p = in_data + nid * spatial_size; + float* out_p = out_data + nid * spatial_size; + + int j = spatial_size; + const float sstd_val = + scale == nullptr ? std : scale[gid * group_size + nid] * std; + const float bias_val = + bias == nullptr ? 0. : bias[gid * group_size + nid]; + const float mean_val = mean; + const __m128 vsstd = lsx_set1_f32(sstd_val); + const __m128 vbias = lsx_set1_f32(bias_val); + const __m128 vmean = lsx_set1_f32(mean_val); + __m128 in0, in1, submean0, submean1, out0, out1; + + for (; j > 7; j -= 8) { + in0 = lsx_loadu_f32(in_p); + in1 = lsx_loadu_f32(in_p + 4); + submean0 = lsx_sub_f32(in0, vmean); + submean1 = lsx_sub_f32(in1, vmean); + out0 = lsx_fmadd_f32(submean0, vsstd, vbias); + out1 = lsx_fmadd_f32(submean1, vsstd, vbias); + + lsx_storeu_f32(out_p, out0); + lsx_storeu_f32(out_p + 4, out1); + + in_p += 8; + out_p += 8; + } + for (; j > 3; j -= 4) { + in0 = lsx_loadu_f32(in_p); + submean0 = lsx_sub_f32(in0, vmean); + out0 = lsx_fmadd_f32(submean0, vsstd, vbias); + + lsx_storeu_f32(out_p, out0); + + in_p += 4; + out_p += 4; + } + for (; j > 0; j--) { + *out_p = (*in_p - mean_val) * sstd_val + bias_val; + in_p++; + out_p++; + } + } + } + } +} +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/gru_compute.cc b/lite/backends/loongarch/math/gru_compute.cc new file mode 100644 index 00000000000..ce8f0e5ce6e --- /dev/null +++ b/lite/backends/loongarch/math/gru_compute.cc @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
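Before moving on to the GRU kernels, a scalar reference for the group_norm path above may be easier to check against: per (batch, group) pair the kernel accumulates a sum and a sum of squares over all pixels of the group's channels, derives mean and variance with the two-term form noted in the comments, then applies scale * (x - mean) * rstd + bias per channel; with groups == c this reduces to instance_norm, as the code comment says. The function below is an illustrative reference only, not code from the patch.

#include <cmath>
#include <cstddef>

// Scalar reference for one (batch, group) pair; layout matches the kernel above:
// in/out point at the first element of this group, channel_offset is
// gid * group_size for indexing scale/bias.
static void group_norm_ref_one_group(const float* in, float* out,
                                     int channels_in_group, int spatial_size,
                                     float epsilon, const float* scale,
                                     const float* bias, int channel_offset) {
  const size_t count = static_cast<size_t>(channels_in_group) * spatial_size;
  float sum = 0.f, square_sum = 0.f;
  for (size_t idx = 0; idx < count; ++idx) {
    sum += in[idx];
    square_sum += in[idx] * in[idx];
  }
  const float mean = sum / count;
  // Same two-term variance form the kernel comments on for precision.
  const float variance = (square_sum - mean * mean * count) / count;
  const float rstd = 1.f / std::sqrt(variance + epsilon);
  for (int c = 0; c < channels_in_group; ++c) {
    const float s = scale ? scale[channel_offset + c] * rstd : rstd;
    const float b = bias ? bias[channel_offset + c] : 0.f;
    for (int p = 0; p < spatial_size; ++p) {
      const size_t idx = static_cast<size_t>(c) * spatial_size + p;
      out[idx] = (in[idx] - mean) * s + b;
    }
  }
}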
*/ + +#include "lite/backends/loongarch/math/gru_compute.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/gru_cpu_kernel.h" +#include "lite/backends/loongarch/math/gru_kernel.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct GRUUnitFunctor { + static void compute(const lite::LoongArchContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate, + bool origin_mode) { + auto blas = math::GetBlas(context); + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size * 2, + frame_size, + 1, + value.prev_out_value, + frame_size, + value.gate_weight, + frame_size * 2, + 1, + value.gate_value, + frame_size * 3); + } + + detail::forward_reset_output(detail::forward::gru_resetOutput(), + value, + frame_size, + batch_size, + active_gate); + + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + value.reset_output_value, + frame_size, + value.state_weight, + frame_size, + 1, + value.gate_value + frame_size * 2, + frame_size * 3); + } + + detail::forward_final_output(detail::forward::gru_finalOutput(), + value, + frame_size, + batch_size, + active_node, + origin_mode); + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const lite::LoongArchContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate, + bool origin_mode) { + detail::backward_state_grad(detail::backward::gru_stateGrad(), + value, + grad, + frame_size, + batch_size, + active_node, + origin_mode); + auto blas = math::GetBlas(context); + if (value.prev_out_value && grad.prev_out_grad) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad + frame_size * 2, + frame_size * 3, + value.state_weight, + frame_size, + 0, + grad.reset_output_grad, + frame_size); + + if (grad.state_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + value.reset_output_value, + frame_size, + grad.gate_grad + frame_size * 2, + frame_size * 3, + 1, + grad.state_weight_grad, + frame_size); + } + } + + detail::backward_reset_grad(detail::backward::gru_resetGrad(), + value, + grad, + frame_size, + batch_size, + active_gate); + if (grad.prev_out_grad && value.prev_out_value) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size * 2, + 1, + grad.gate_grad, + frame_size * 3, + value.gate_weight, + frame_size * 2, + 1, + grad.prev_out_grad, + frame_size); + + if (grad.gate_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size * 2, + batch_size, + 1, + value.prev_out_value, + frame_size, + grad.gate_grad, + frame_size * 3, + 1, + grad.gate_weight_grad, + frame_size * 2); + } + } + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/gru_compute.h b/lite/backends/loongarch/math/gru_compute.h new file mode 100644 index 00000000000..dd4a67c2bb0 --- /dev/null +++ b/lite/backends/loongarch/math/gru_compute.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "lite/backends/loongarch/math/activation_functions.h" +#include "lite/core/context.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct GRUMetaValue { + T *gate_weight; + T *state_weight; + T *gate_value; + T *reset_output_value; + T *output_value; + T *prev_out_value; +}; + +template +struct GRUMetaGrad { + T *gate_weight_grad; + T *state_weight_grad; + T *gate_grad; + T *reset_output_grad; + T *output_grad; + T *prev_out_grad; +}; + +template +struct GRUUnitFunctor { + static void compute(const lite::Context &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate, + bool origin_mode); +}; + +template +struct GRUUnitGradFunctor { + static void compute(const lite::Context &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate, + bool origin_mode); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/gru_cpu_kernel.h b/lite/backends/loongarch/math/gru_cpu_kernel.h new file mode 100644 index 00000000000..c601cb5e04a --- /dev/null +++ b/lite/backends/loongarch/math/gru_cpu_kernel.h @@ -0,0 +1,604 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
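The GEMM shapes in GRUUnitFunctor above only make sense once the per-row layout of gate_value is spelled out: each batch row stores the update gate, the reset gate and the candidate frame state back to back, each frame_size wide, so rows are 3 * frame_size apart, the h_{t-1} * gate_weight product lands in the first 2 * frame_size columns, and the reset_output * state_weight product lands in the last frame_size columns. The small view type below only restates those offsets; it is illustrative and not part of the patch.

// Per-row layout assumed by GRUUnitFunctor and the detail:: kernels:
//   [ update_gate | reset_gate | frame_state ], each frame_size wide.
template <typename T>
struct GateRowView {                 // illustrative helper, not in the patch
  T* gate_value;                     // start of one batch row
  int frame_size;
  T* update_gate() const { return gate_value; }
  T* reset_gate() const { return gate_value + frame_size; }
  T* frame_state() const { return gate_value + frame_size * 2; }
  int row_stride() const { return frame_size * 3; }
};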
*/ + +#pragma once +#include +#include "lite/backends/loongarch/math/activation_functions.h" +#include "lite/backends/loongarch/math/gru_compute.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +namespace detail { + +template +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, + T *reset_output_value, + T *prev_output_value, + int frame_size, + ActivationType active_gate) { + T r_value_update_gate; + T r_value_reset_gate; + T r_value_reset_output; + T r_prev_out = 0; + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_reset_output(&r_value_update_gate, + &r_value_reset_gate, + &r_prev_out, + &r_value_reset_output, + active_gate); + + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + reset_output_value[i] = r_value_reset_output; + } +} + +template +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, + T *prev_output_value, + T *output_value, + int frame_size, + ActivationType active_node, + bool origin_mode) { + T r_value_update_gate; + T r_value_frame_state; + T r_prev_out = 0; + T r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_final_output(&r_value_update_gate, + &r_value_frame_state, + &r_prev_out, + &r_output, + active_node, + origin_mode); + + frame_state[i] = r_value_frame_state; + output_value[i] = r_output; + } +} + +template +void hl_lasx_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, + T *reset_output_value, + T *prev_output_value, + int frame_size, + ActivationType active_gate) { +#ifdef __loongarch_asx + __m256 r_value_update_gate, r_value_update_gate_last = lasx_set1_f32(0.0f); + __m256 r_value_reset_gate, r_value_reset_gate_last = lasx_set1_f32(0.0f); + __m256 r_value_reset_output; + __m256 r_prev_out = lasx_set1_f32(0.0f), + r_prev_out_last = lasx_set1_f32(0.0f); + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + int block = 8; + const int n = frame_size; + const int rest = n % block; + const int end = n - rest; + int i = 0; + + if (rest > 0) { + i = n - block; + r_value_update_gate_last = + lasx_loadu_f32((const float *)(update_gate + i)); + r_value_reset_gate_last = lasx_loadu_f32((const float *)(reset_gate + i)); + if (prev_output_value) { + r_prev_out_last = lasx_loadu_f32((const float *)(prev_output_value + i)); + } + } + + for (i = 0; i < end; i += block) { + r_value_update_gate = lasx_loadu_f32((const float *)(update_gate + i)); + r_value_reset_gate = lasx_loadu_f32((const float *)(reset_gate + i)); + if (prev_output_value) { + r_prev_out = lasx_loadu_f32((const float *)(prev_output_value + i)); + } + + op_reset_output(&r_value_update_gate, + &r_value_reset_gate, + &r_prev_out, + &r_value_reset_output, + active_gate); + + lasx_storeu_f32(reinterpret_cast(update_gate + i), + r_value_update_gate); + lasx_storeu_f32(reinterpret_cast(reset_gate + i), + r_value_reset_gate); + lasx_storeu_f32(reinterpret_cast(reset_output_value + i), + r_value_reset_output); + } + + if (rest > 0) { + i = n - block; + + op_reset_output(&r_value_update_gate_last, 
+ &r_value_reset_gate_last, + &r_prev_out_last, + &r_value_reset_output, + active_gate); + + lasx_storeu_f32(reinterpret_cast(update_gate + i), + r_value_update_gate_last); + lasx_storeu_f32(reinterpret_cast(reset_gate + i), + r_value_reset_gate_last); + lasx_storeu_f32(reinterpret_cast(reset_output_value + i), + r_value_reset_output); + } +#endif +} + +template +void hl_lasx_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, + T *prev_output_value, + T *output_value, + int frame_size, + ActivationType active_node, + bool origin_mode) { +#ifdef __loongarch_asx + __m256 r_value_update_gate, r_value_update_gate_last = lasx_set1_f32(0.0f); + __m256 r_value_frame_state, r_value_frame_state_last = lasx_set1_f32(0.0f); + __m256 r_prev_out = lasx_set1_f32(0.0f), + r_prev_out_last = lasx_set1_f32(0.0f); + __m256 r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + int block = 8; + const int n = frame_size; + const int rest = n % block; + const int end = n - rest; + int i = 0; + + if (rest > 0) { + i = n - block; + r_value_update_gate_last = + lasx_loadu_f32((const float *)(update_gate + i)); + r_value_frame_state_last = + lasx_loadu_f32((const float *)(frame_state + i)); + if (prev_output_value) { + r_prev_out_last = lasx_loadu_f32((const float *)(prev_output_value + i)); + } + } + + for (i = 0; i < end; i += block) { + r_value_update_gate = lasx_loadu_f32((const float *)(update_gate + i)); + r_value_frame_state = lasx_loadu_f32((const float *)(frame_state + i)); + if (prev_output_value) { + r_prev_out = lasx_loadu_f32((const float *)(prev_output_value + i)); + } + + op_final_output(&r_value_update_gate, + &r_value_frame_state, + &r_prev_out, + &r_output, + active_node, + origin_mode); + + lasx_storeu_f32(reinterpret_cast(frame_state + i), + r_value_frame_state); + lasx_storeu_f32(reinterpret_cast(output_value + i), r_output); + } + + if (rest > 0) { + i = n - block; + op_final_output(&r_value_update_gate_last, + &r_value_frame_state_last, + &r_prev_out_last, + &r_output, + active_node, + origin_mode); + + lasx_storeu_f32(reinterpret_cast(frame_state + i), + r_value_frame_state_last); + lasx_storeu_f32(reinterpret_cast(output_value + i), r_output); + } + +#endif +} + +template +inline void forward_reset_output(OpResetOutput op_reset_output, + GRUMetaValue value, + int frame_size, + int batch_size, + ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetOutput::lasx && (frame_size > static_cast(8 - 1)) && + (sizeof(T) == 4)) { + hl_lasx_gru_forward_reset_output(op_reset_output, + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + active_gate); + } else { + hl_naive_gru_forward_reset_output(op_reset_output, + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + active_gate); + } + + value.gate_value += frame_size * 3; + value.reset_output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +inline void forward_final_output(OpFinalOutput op_final_output, + GRUMetaValue value, + int frame_size, + int batch_size, + ActivationType active_node, + bool origin_mode) { + for (int b = 0; b < batch_size; b++) { + if (OpFinalOutput::lasx && (frame_size > static_cast(8 - 1)) && + (sizeof(T) == 4)) { + hl_lasx_gru_forward_final_output(op_final_output, + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + active_node, + origin_mode); + } else { + 
hl_naive_gru_forward_final_output(op_final_output, + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + active_node, + origin_mode); + } + + value.gate_value += frame_size * 3; + value.output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, + T *gate_value, + T *gate_grad, + T *prev_out_value, + T *prev_out_grad, + T *output_grad, + int frame_size, + ActivationType active_node, + bool origin_mode) { + T r_update_gate_value; + T r_update_gate_grad; + T r_frame_state_value; + T r_frame_state_grad; + T r_out_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *frame_state_value = gate_value + frame_size * 2; + T *frame_state_grad = gate_grad + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = output_grad[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; + } + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; + } + + op_state_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_frame_state_value, + &r_frame_state_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_out_grad, + active_node, + origin_mode); + + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; + } + } +} + +template +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, + T *gate_value, + T *gate_grad, + T *prev_out_value, + T *prev_out_grad, + T *reset_output_grad, + int frame_size, + ActivationType active_gate) { + T r_update_gate_value; + T r_update_gate_grad; + T r_reset_gate_value; + T r_reset_gate_grad; + T r_reset_output_grad = 0; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *reset_gate_value = gate_value + frame_size; + T *reset_gate_grad = gate_grad + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = reset_output_grad[i]; + } + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; + } + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; + } + + op_reset_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_reset_gate_value, + &r_reset_gate_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_reset_output_grad, + active_gate); + + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; + } + } +} + +template +void hl_lasx_gru_backward_state_grad(OpStateGrad op_state_grad, + T *gate_value, + T *gate_grad, + T *prev_out_value, + T *prev_out_grad, + T *output_grad, + int frame_size, + ActivationType active_node, + bool origin_mode) { +#ifdef __loongarch_asx + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_frame_state_value; + __m256 r_frame_state_grad; + __m256 r_out_grad; + __m256 r_prev_out_value = lasx_set1_f32(0.0f); + __m256 r_prev_out_grad = lasx_set1_f32(0.0f); + __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value); + __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad); + __m256 *frame_state_value = + 
reinterpret_cast<__m256 *>(gate_value + frame_size * 2); + __m256 *frame_state_grad = + reinterpret_cast<__m256 *>(gate_grad + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i]; + if (prev_out_value) { + r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i]; + } + if (prev_out_grad) { + r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; + } + + op_state_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_frame_state_value, + &r_frame_state_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_out_grad, + active_node, + origin_mode); + + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad; + } + } +#endif +} + +template +void hl_lasx_gru_backward_reset_grad(OpResetGrad op_reset_grad, + T *gate_value, + T *gate_grad, + T *prev_out_value, + T *prev_out_grad, + T *reset_output_grad, + int frame_size, + ActivationType active_gate) { +#ifdef __loongarch_asx + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_reset_gate_value; + __m256 r_reset_gate_grad; + __m256 r_reset_output_grad = lasx_set1_f32(0.0f); + __m256 r_prev_out_value = lasx_set1_f32(0.0f); + __m256 r_prev_out_grad = lasx_set1_f32(0.0f); + __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value); + __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad); + __m256 *reset_gate_value = + reinterpret_cast<__m256 *>(gate_value + frame_size); + __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i]; + } + if (prev_out_value) { + r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i]; + } + if (prev_out_grad) { + r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; + } + + op_reset_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_reset_gate_value, + &r_reset_gate_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_reset_output_grad, + active_gate); + + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad; + } + } +#endif +} + +template +inline void backward_state_grad(OpStateGrad op_state_grad, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + ActivationType active_node, + bool origin_mode) { + for (int b = 0; b < batch_size; b++) { + if (OpStateGrad::lasx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_lasx_gru_backward_state_grad(op_state_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + active_node, + origin_mode); + } else { + hl_naive_gru_backward_state_grad(op_state_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + active_node, + origin_mode); + } + + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + + grad.gate_grad += frame_size * 3; + grad.output_grad += frame_size; + if 
(grad.prev_out_grad) { + grad.prev_out_grad += frame_size; + } + } +} + +template +inline void backward_reset_grad(OpResetGrad op_reset_grad, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetGrad::lasx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_lasx_gru_backward_reset_grad(op_reset_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + active_gate); + } else { + hl_naive_gru_backward_reset_grad(op_reset_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + active_gate); + } + + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + + grad.gate_grad += frame_size * 3; + grad.reset_output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; + } + } +} + +} // namespace detail +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/gru_kernel.h b/lite/backends/loongarch/math/gru_kernel.h new file mode 100644 index 00000000000..efcbd203d16 --- /dev/null +++ b/lite/backends/loongarch/math/gru_kernel.h @@ -0,0 +1,221 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
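Two dispatch details in gru_cpu_kernel.h above are easy to miss: the forward LASX paths only need frame_size > 7 because they snapshot the last full 8-float vector at offset frame_size - 8 before the main loop and commit it afterwards, so the tail simply overlaps lanes the main loop already wrote; the backward LASX paths instead require frame_size to be a multiple of 8 (!(frame_size & (8 - 1))) because they reinterpret the buffers as __m256 arrays. A scalar illustration of the overlapping-tail idea, on a plain float array (illustrative only):

#include <algorithm>

// Mirror of the rest/end bookkeeping in hl_lasx_gru_forward_*: process n
// floats in blocks of 8, handling the remainder with one overlapping block.
void scale_in_blocks_of_8(float* data, int n, float factor) {
  const int block = 8;
  if (n < block) {                      // the kernels never take this branch:
    for (int j = 0; j < n; ++j) data[j] *= factor;  // dispatch needs n > 7
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  float last[8];
  // Snapshot the final full block first, so its lanes are transformed from
  // their original values exactly once even though the main loop touches them.
  if (rest > 0) std::copy(data + n - block, data + n, last);
  for (int i = 0; i < end; i += block) {
    for (int j = 0; j < block; ++j) data[i + j] *= factor;
  }
  if (rest > 0) {
    for (int j = 0; j < block; ++j) data[n - block + j] = last[j] * factor;
  }
}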
*/ + +#pragma once +#include +#include "lite/backends/loongarch/math/activation_functions.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +namespace detail { + +namespace forward { + +template +class gru_resetOutput { + public: + HOSTDEVICE void operator()(T *value_update_gate, + T *value_reset_gate, + T *prev_out, + T *value_reset_output, + ActivationType act_gate) { + *value_update_gate = activation(*value_update_gate, act_gate); + *value_reset_gate = activation(*value_reset_gate, act_gate); + *value_reset_output = (*prev_out) * (*value_reset_gate); + } +#ifndef __loongarch_asx + static const bool lasx = false; +#else + static const bool lasx = true; + HOSTDEVICE void operator()(__m256 *value_update_gate, + __m256 *value_reset_gate, + __m256 *prev_out, + __m256 *value_reset_output, + ActivationType act_gate) { + *value_update_gate = activation(*value_update_gate, act_gate); + *value_reset_gate = activation(*value_reset_gate, act_gate); + *value_reset_output = lasx_mul_f32(*prev_out, *value_reset_gate); + } +#endif +}; + +template +class gru_finalOutput { + public: + HOSTDEVICE void operator()(T *value_update_gate, + T *value_frame_state, + T *prev_out, + T *value_output, + ActivationType act_input, + bool origin_mode) { + *value_frame_state = activation(*value_frame_state, act_input); + if (origin_mode) { + *value_output = ((*value_update_gate) * (*prev_out)) + + *value_frame_state - + ((*value_update_gate) * (*value_frame_state)); + } else { + *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) + + ((*value_update_gate) * (*value_frame_state)); + } + } +#ifndef __loongarch_asx + static const bool lasx = false; +#else + static const bool lasx = true; + HOSTDEVICE void operator()(__m256 *value_update_gate, + __m256 *value_frame_state, + __m256 *prev_out, + __m256 *value_output, + ActivationType act_input, + bool origin_mode) { + *value_frame_state = activation(*value_frame_state, act_input); + if (origin_mode) { + *value_output = lasx_sub_f32( + lasx_add_f32(lasx_mul_f32(*value_update_gate, *prev_out), + *value_frame_state), + lasx_mul_f32(*value_update_gate, *value_frame_state)); + } else { + *value_output = lasx_add_f32( + lasx_sub_f32(*prev_out, + lasx_mul_f32(*value_update_gate, *prev_out)), + lasx_mul_f32(*value_update_gate, *value_frame_state)); + } + } +#endif +}; +} // namespace forward + +namespace backward { + +template +class gru_stateGrad { + public: + HOSTDEVICE void operator()(T *value_update_gate, + T *grad_update_gate, + T *value_frame_state, + T *grad_frame_state, + T *value_prev_out, + T *grad_prev_out, + T *grad_output, + ActivationType act_input, + bool origin_mode) { + if (origin_mode) { + *grad_update_gate = + (*grad_output) * ((*value_prev_out) - (*value_frame_state)); + *grad_prev_out += (*grad_output * (*value_update_gate)); + *grad_frame_state = activation( + *grad_output * (static_cast(1.0) - (*value_update_gate)), + *value_frame_state, + act_input); + } else { + *grad_update_gate = + (*grad_output) * ((*value_frame_state) - (*value_prev_out)); + *grad_prev_out += + (*grad_output * (static_cast(1.0) - *value_update_gate)); + *grad_frame_state = activation( + *grad_output * (*value_update_gate), *value_frame_state, act_input); + } + } +#ifndef __loongarch_asx + static const bool lasx = false; +#else + static const bool lasx = true; + HOSTDEVICE void operator()(__m256 *value_update_gate, + __m256 *grad_update_gate, + __m256 *value_frame_state, + __m256 *grad_frame_state, + __m256 
*value_prev_out, + __m256 *grad_prev_out, + __m256 *grad_output, + ActivationType act_input, + bool origin_mode) { + if (origin_mode) { + *grad_update_gate = lasx_mul_f32( + *grad_output, lasx_sub_f32(*value_prev_out, *value_frame_state)); + *grad_prev_out = lasx_add_f32( + *grad_prev_out, lasx_mul_f32(*grad_output, *value_update_gate)); + *grad_frame_state = activation( + lasx_mul_f32( + *grad_output, + lasx_sub_f32(lasx_set1_f32(1.0f), *value_update_gate)), + *value_frame_state, + act_input); + } else { + *grad_update_gate = lasx_mul_f32( + *grad_output, lasx_sub_f32(*value_frame_state, *value_prev_out)); + *grad_prev_out = lasx_add_f32( + *grad_prev_out, + lasx_mul_f32( + *grad_output, + lasx_sub_f32(lasx_set1_f32(1.0f), *value_update_gate))); + *grad_frame_state = + activation(lasx_mul_f32(*grad_output, *value_update_gate), + *value_frame_state, + act_input); + } + } +#endif +}; + +template +class gru_resetGrad { + public: + HOSTDEVICE void operator()(T *value_update_gate, + T *grad_update_gate, + T *value_reset_gate, + T *grad_reset_gate, + T *value_prev_out, + T *grad_prev_out, + T *grad_reset_output, + ActivationType act_gate) { + *grad_reset_gate = (*grad_reset_output * (*value_prev_out)); + *grad_prev_out += (*grad_reset_output * (*value_reset_gate)); + *grad_update_gate = + activation(*grad_update_gate, *value_update_gate, act_gate); + *grad_reset_gate = + activation(*grad_reset_gate, *value_reset_gate, act_gate); + } +#ifndef __loongarch_asx + static const bool lasx = false; +#else + static const bool lasx = true; + HOSTDEVICE void operator()(__m256 *value_update_gate, + __m256 *grad_update_gate, + __m256 *value_reset_gate, + __m256 *grad_reset_gate, + __m256 *value_prev_out, + __m256 *grad_prev_out, + __m256 *grad_reset_output, + ActivationType act_gate) { + *grad_reset_gate = lasx_mul_f32(*grad_reset_output, *value_prev_out); + *grad_prev_out = lasx_add_f32( + *grad_prev_out, lasx_mul_f32(*grad_reset_output, *value_reset_gate)); + *grad_update_gate = + activation(*grad_update_gate, *value_update_gate, act_gate); + *grad_reset_gate = + activation(*grad_reset_gate, *value_reset_gate, act_gate); + } +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/im2col.cc b/lite/backends/loongarch/math/im2col.cc new file mode 100644 index 00000000000..126e6ef67c1 --- /dev/null +++ b/lite/backends/loongarch/math/im2col.cc @@ -0,0 +1,292 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
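The two branches in gru_finalOutput above are the standard GRU output update in its two conventions: with origin_mode the update gate u weights the previous hidden state, h_t = u * h_{t-1} + (1 - u) * s_t, otherwise it weights the candidate, h_t = (1 - u) * h_{t-1} + u * s_t, where s_t is the already-activated frame state. The scalar restatement below is illustrative only:

// Scalar restatement of the gru_finalOutput branches (illustrative).
inline float gru_final_output_ref(float update_gate,  // u, already activated
                                  float frame_state,  // s_t, already activated
                                  float prev_out,     // h_{t-1}
                                  bool origin_mode) {
  return origin_mode
             ? update_gate * prev_out + (1.f - update_gate) * frame_state
             : (1.f - update_gate) * prev_out + update_gate * frame_state;
}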
*/ + +#include "lite/backends/loongarch/math/im2col.h" +#include +#include "lite/backends/loongarch/math/im2col_cfo_cpu.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + lite::Tensor* col) { + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); + + if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && + dilation[1] == 1) { + if (padding[0] == 0 && padding[1] == 0) { + im2col_sh1sw1dh1dw1ph0pw0(im, col); + return; + } else if (padding[0] == 1 && padding[1] == 1) { + im2col_sh1sw1dh1dw1ph1pw1(im, col); + return; + } + // TODO(TJ): complete padding >=2 + } + im2col_common(im, dilation, stride, padding, col); + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + lite::Tensor* im) { + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int col_height = col.dims()[3]; + int col_width = col.dims()[4]; + + CHECK_EQ((im_height + padding[0] + padding[2] - + ((dilation[0] * (filter_height - 1) + 1))) / + stride[0] + + 1, + col_height) + << "Output_height and padding(padding_up, padding_down) are " + "inconsistent."; + CHECK_EQ((im_width + padding[1] + padding[3] - + ((dilation[1] * (filter_width - 1) + 1))) / + stride[1] + + 1, + col_width) + << "Output_height and padding(padding_up, padding_down) are " + "inconsistent."; + + int channels_col = im_channels * filter_height * filter_width; + + T* im_data = im->template mutable_data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + if ((im_row_idx) >= 0 && (im_row_idx) < im_height && + (im_col_idx) >= 0 && (im_col_idx) < im_width) { + im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += + col_data[(c * col_height + h) * col_width + w]; + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + lite::Tensor* col) { + 
CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[3]; + int filter_width = col->dims()[4]; + int col_height = col->dims()[0]; + int col_width = col->dims()[1]; + + const T* im_data = im.data(); + T* col_data = col->template mutable_data(); + + for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { + for (int channel = 0; channel < im_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < filter_height; + ++filter_row_idx) { + int im_row_offset = + col_row_idx * stride[0] + filter_row_idx - padding[0]; + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_col_offset = + col_col_idx * stride[1] + filter_col_idx - padding[1]; + + int col_offset = + ((((col_row_idx)*col_width + col_col_idx) * im_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + + int im_offset = (channel * im_height + im_row_offset) * im_width + + im_col_offset; + col_data[col_offset] = + (im_row_offset < 0 || im_row_offset >= im_height || + im_col_offset < 0 || im_col_offset >= im_width) + ? static_cast(0) + : im_data[im_offset]; + } + } + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + lite::Tensor* im) { + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int col_height = col.dims()[0]; + int col_width = col.dims()[1]; + + CHECK_EQ( + (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, + col_height) + << "Output_height and padding(padding_up, padding_down) are " + "inconsistent."; + CHECK_EQ( + (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, + col_width) + << "col_width and padding(padding_left, padding_right) are " + "inconsistent."; + + T* im_data = im->template mutable_data(); + const T* col_data = col.data(); + + for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { + for (int channel = 0; channel < im_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < filter_height; + ++filter_row_idx) { + int im_row_offset = + col_row_idx * stride[0] + filter_row_idx - padding[0]; + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_col_offset = + col_col_idx * stride[1] + filter_col_idx - padding[1]; + + int col_offset = + (((col_row_idx * col_width + col_col_idx) * im_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + + if (im_row_offset >= 0 && im_row_offset < im_height && + im_col_offset >= 0 && im_col_offset < im_width) { + int im_offset = + (channel * im_height + im_row_offset) * im_width + + im_col_offset; + im_data[im_offset] += col_data[col_offset]; + } + } + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; 
+template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/im2col.h b/lite/backends/loongarch/math/im2col.h new file mode 100644 index 00000000000..c448ca58245 --- /dev/null +++ b/lite/backends/loongarch/math/im2col.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum class ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data. + * \param imShape The shape of imData, + * [input_channels, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * \param dilations dilation data. + * \param 2-dimension [dilation_height, dilation_width]. + * + * \param strides stride data. + * \param 2-dimension [stride_height, stride_width]. + * + * \param paddings padding data. + * \param 4-dimension [up_pad, left_pad, down_pad, right_pad]. + * + * If the template argument Format is kCFO, the shape of colData is: + * [input_channels, filter_height, filter_width, output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_height * filter_width, and the width is equal + * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_height, + * filter_width, ======> [height, width] + * output_height, + * output_width] + * + * If the template argument Format is kOCF, the shape of colData is: + * [output_height, output_width, input_channels, filter_height, filter_width] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seq_length, step_size], where the seq_length + * is equal output_height * output_width, and the step_size is equal + * input_channels * filter_height * filter_width. + * + * Reshape: + * shape of colData shape of sequence matrix + * [output_height, + * output_width, + * input_channels, ======> [seqLength, stepSize] + * filter_height, + * filter_width] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. 
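With the kCFO layout described above, convolution becomes a single GEMM: col is viewed as an [input_channels * filter_height * filter_width, output_height * output_width] matrix. The helper below computes the col dimensions with the same output-size formula the Col2Im checks in im2col.cc use; it is an illustrative sketch, not part of the patch, and takes padding in the documented [up, left, down, right] order.

#include <cstdint>
#include <vector>

// Output spatial extent, matching the CHECK_EQ formula in Col2ImFunctor:
//   (in + pad_begin + pad_end - (dilation * (k - 1) + 1)) / stride + 1
inline int conv_out_size(
    int in_size, int k, int dilation, int pad_begin, int pad_end, int stride) {
  return (in_size + pad_begin + pad_end - (dilation * (k - 1) + 1)) / stride + 1;
}

// col dims for the kCFO im2col: [C, kh, kw, out_h, out_w].
inline std::vector<int64_t> im2col_cfo_col_dims(
    int C, int H, int W, int kh, int kw,
    const std::vector<int>& dilation,   // {dh, dw}
    const std::vector<int>& stride,     // {sh, sw}
    const std::vector<int>& padding) {  // {up, left, down, right}
  const int out_h =
      conv_out_size(H, kh, dilation[0], padding[0], padding[2], stride[0]);
  const int out_w =
      conv_out_size(W, kw, dilation[1], padding[1], padding[3], stride[1]);
  // As a GEMM operand this reshapes to [C * kh * kw, out_h * out_w].
  return {C, kh, kw, out_h, out_w};
}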
+ */ +template +class Im2ColFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + lite::Tensor* col); +}; + +template +class Col2ImFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + lite::Tensor* im); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/im2col_cfo_cpu.h b/lite/backends/loongarch/math/im2col_cfo_cpu.h new file mode 100644 index 00000000000..92278462eb7 --- /dev/null +++ b/lite/backends/loongarch/math/im2col_cfo_cpu.h @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/** + * The most common im2col algorithm. + * Support dilation, stride and padding. + */ +template +inline void im2col_common(const lite::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + lite::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->template mutable_data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 
static_cast(0) + : im_data[im_idx]; + } + } + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 0 + */ +template +inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, + lite::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + const T* im_data = im.data(); + T* col_data = col->template mutable_data(); + int col_matrix_width = output_width * output_height; + int im_size = im_height * im_width; + size_t copy_size = sizeof(T) * output_width; + const T* im_data_oh = im_data; + T* dst_data_oh = col_data; + for (int oh = 0; oh < output_height; ++oh) { + const T* src_data_ic = im_data_oh; + T* dst_data = dst_data_oh; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = src_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + src_data_ic = src_data_ic + im_size; + } + im_data_oh = im_data_oh + im_width; + dst_data_oh = dst_data_oh + output_width; + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 1 + * and filter_width == 1 have a special implementation + */ +template +inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, + lite::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + constexpr int plh = 1; + constexpr int prh = 1; + constexpr int plw = 1; + constexpr int prw = 1; + + const T* im_data = im.data(); + T* col_data = col->template mutable_data(); + int im_size = im_height * im_width; + int col_matrix_width = output_width * output_height; + int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow + int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow + + // fill height padding + { + size_t copy_size = sizeof(T) * output_width; + T* col_start_l = col_data; + T* col_start_r = col_data + (filter_height - 1) * col_block_fh + + col_matrix_width - output_width; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_l = col_start_l; + T* dst_data_r = col_start_r; + for (int kw = 0; kw < filter_width; ++kw) { + std::memset(dst_data_l, 0, copy_size); + std::memset(dst_data_r, 0, copy_size); + dst_data_l = dst_data_l + col_matrix_width; + dst_data_r = dst_data_r + col_matrix_width; + } + col_start_l = col_start_l + col_block_ic; + col_start_r = col_start_r + col_block_ic; + } + } + + auto pad = static_cast(0); + if (filter_width == 1) { + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data = dst_data_kh; + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width - 1; + *dst_data = pad; + ++dst_data; + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + // fill core + size_t copy_size = sizeof(T) * (output_width - plw - prw); + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = + im_data + (oh - plh > 0 ? 
oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + col_matrix_width; + continue; + } + std::memcpy(dst_data + plw, src_data, copy_size); + dst_data = dst_data + col_matrix_width; + src_data = src_data + im_width; + } + } + } + return; + } + + // filter_width != 1 + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (T* dst_data : + {dst_data_kh, + dst_data_kh + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), + src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy( + dst_data, src_data + (kw - plw), sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { + std::memcpy( + dst_data, src_data + (kw - plw), sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/include/group_norm.h b/lite/backends/loongarch/math/include/group_norm.h new file mode 100644 index 00000000000..d23f917b884 --- /dev/null +++ b/lite/backends/loongarch/math/include/group_norm.h @@ -0,0 +1,38 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
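// Illustrative note (editorial, not from the patch): group_norm declared below
// normalises each (batch, group) slab of an NCHW tensor. Writing G = groups and
// Cg = c / G, the statistics are taken over Cg * height * width elements:
//
//   mean[n,g] = sum(x[n, g*Cg:(g+1)*Cg, :, :]) / (Cg * height * width)
//   var[n,g]  = E[x^2] - mean[n,g]^2
//   out[n,ch,h,w] = (x[n,ch,h,w] - mean[n,g]) / sqrt(var[n,g] + epsilon)
//                     * scale[ch] + bias[ch]   // per-channel scale/bias assumed
//
// With groups == c each group holds a single channel and this reduces to the
// instance_norm declared in the sibling header.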
+ +#pragma once + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void group_norm(const float* in, + float* out, + const int n, + const int c, + const int height, + const int width, + const float epsilon, + const int groups, + const float* scale, + const float* bias, + float* saved_mean, + float* saved_variance); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/include/instance_norm.h b/lite/backends/loongarch/math/include/instance_norm.h new file mode 100644 index 00000000000..5892b37688d --- /dev/null +++ b/lite/backends/loongarch/math/include/instance_norm.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void instance_norm(const float* in, + float* out, + const int n, + const int c, + const int height, + const int width, + const float epsilon, + const float* scale, + const float* bias, + float* saved_mean, + float* saved_variance); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/include/mathfuns.h b/lite/backends/loongarch/math/include/mathfuns.h new file mode 100644 index 00000000000..72c94160d7e --- /dev/null +++ b/lite/backends/loongarch/math/include/mathfuns.h @@ -0,0 +1,7 @@ +#pragma once + +#ifdef __loongarch_asx +#include "lite/backends/loongarch/math/lasx/lasx_mathfuns.h" +#endif + +#include "lite/backends/loongarch/math/lsx/lsx_mathfuns.h" diff --git a/lite/backends/loongarch/math/instance_norm.cc b/lite/backends/loongarch/math/instance_norm.cc new file mode 100644 index 00000000000..dfb5305c2e6 --- /dev/null +++ b/lite/backends/loongarch/math/instance_norm.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
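// Illustrative scalar reference for the LSX-vectorised routine below
// (editorial sketch with the same semantics; note that saved_variance stores
// the reciprocal standard deviation, exactly as the implementation does):
//
//   for (int i = 0; i < n * c; ++i) {
//     const float* x = in + i * height * width;
//     float sum = 0.f, sqsum = 0.f;
//     for (int k = 0; k < height * width; ++k) {
//       sum += x[k];
//       sqsum += x[k] * x[k];
//     }
//     float mean = sum / (height * width);
//     float rstd = 1.f / sqrtf(sqsum / (height * width) - mean * mean + epsilon);
//     saved_mean[i] = mean;
//     saved_variance[i] = rstd;
//     float s = scale ? scale[i % c] * rstd : rstd;
//     float b = bias ? bias[i % c] : 0.f;
//     for (int k = 0; k < height * width; ++k) {
//       out[i * height * width + k] = (x[k] - mean) * s + b;
//     }
//   }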
+ +#include "lite/backends/loongarch/math/include/instance_norm.h" +#include "lite/backends/loongarch/xxl.h" +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void instance_norm(const float* in, + float* out, + const int n, + const int c, + const int height, + const int width, + const float epsilon, + const float* scale, + const float* bias, + float* saved_mean, + float* saved_variance) { + int nc = n * c; + int spatial_size = height * width; + +// compute saved_mean and saved_variance +#pragma omp parallel for + for (int i = 0; i < nc; ++i) { + const float* in_p = in + i * spatial_size; + float sum_spatial = 0.f; + float summ_spatial = 0.f; + for (int h = 0; h < height; ++h) { + int w = width; + + __m128 sum0 = lsx_set1_f32(0.f); + __m128 sum1 = lsx_set1_f32(0.f); + __m128 sum2 = lsx_set1_f32(0.f); + __m128 sum3 = lsx_set1_f32(0.f); + __m128 square_sum0 = lsx_set1_f32(0.f); + __m128 square_sum1 = lsx_set1_f32(0.f); + __m128 square_sum2 = lsx_set1_f32(0.f); + __m128 square_sum3 = lsx_set1_f32(0.f); + __m128 in0, in1, in2, in3; + for (; w > 15; w -= 16) { + in0 = lsx_loadu_f32(in_p); + in1 = lsx_loadu_f32(in_p + 4); + in2 = lsx_loadu_f32(in_p + 8); + in3 = lsx_loadu_f32(in_p + 12); + // add x + sum0 = lsx_add_f32(sum0, in0); + sum1 = lsx_add_f32(sum1, in1); + sum2 = lsx_add_f32(sum2, in2); + sum3 = lsx_add_f32(sum3, in3); + // add x * x + square_sum0 = lsx_fmadd_f32(in0, in0, square_sum0); + square_sum1 = lsx_fmadd_f32(in1, in1, square_sum1); + square_sum2 = lsx_fmadd_f32(in2, in2, square_sum2); + square_sum3 = lsx_fmadd_f32(in3, in3, square_sum3); + + in_p += 16; + } + for (; w > 7; w -= 8) { + in0 = lsx_loadu_f32(in_p); + in1 = lsx_loadu_f32(in_p + 4); + sum0 = lsx_add_f32(sum0, in0); + sum1 = lsx_add_f32(sum1, in1); + square_sum0 = lsx_fmadd_f32(in0, in0, square_sum0); + square_sum1 = lsx_fmadd_f32(in1, in1, square_sum1); + in_p += 8; + } + for (; w > 3; w -= 4) { + in0 = lsx_loadu_f32(in_p); + sum0 = lsx_add_f32(sum0, in0); + square_sum0 = lsx_fmadd_f32(in0, in0, square_sum0); + in_p += 4; + } + float sum = 0.f; + float summ = 0.f; + for (; w > 0; w--) { + sum += *in_p; + summ += (*in_p) * (*in_p); + in_p++; + } + + sum0 = lsx_add_f32(sum0, sum1); + sum2 = lsx_add_f32(sum2, sum3); + square_sum0 = lsx_add_f32(square_sum0, square_sum1); + square_sum2 = lsx_add_f32(square_sum2, square_sum3); + + sum0 = lsx_add_f32(sum0, sum2); + square_sum0 = lsx_add_f32(square_sum0, square_sum2); + + __m128 r = lsx_hadd_f32(sum0, square_sum0); + r = lsx_hadd_f32(r, r); + float buf[4]; + lsx_storeu_f32(buf, r); + sum += buf[0]; + summ += buf[1]; + sum_spatial += sum; + summ_spatial += summ; + } + float mean = sum_spatial / spatial_size; + // float variance = summ / spatial_size - mean * mean; + // the flolowing code has higher precision than above comment code + float variance = (summ_spatial - mean * mean * spatial_size) / spatial_size; + float std = 1.f / sqrtf(variance + epsilon); + + saved_mean[i] = mean; + saved_variance[i] = std; + } +// compute instance_norm result: out = scale * (in - mean) / std + bias +#pragma omp parallel for + for (int i = 0; i < nc; ++i) { + const float* in_p = in + i * spatial_size; + float* out_p = out + i * spatial_size; + int j = spatial_size; + const float sstd_val = + scale == nullptr ? saved_variance[i] : scale[i % c] * saved_variance[i]; + const float bias_val = bias == nullptr ? 0. 
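          // (editorial note) saved_variance[i] already holds 1/sqrt(var + eps),
          // so sstd_val above folds the optional per-channel scale into it and
          // the loops below reduce to one fmadd per element:
          //   out = (in - mean) * sstd + bias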
: bias[i % c]; + const float mean_val = saved_mean[i]; + const __m128 vsstd = lsx_set1_f32(sstd_val); + const __m128 vbias = lsx_set1_f32(bias_val); + const __m128 vmean = lsx_set1_f32(mean_val); + __m128 in0, in1, submean0, submean1, out0, out1; + + for (; j > 7; j -= 8) { + in0 = lsx_loadu_f32(in_p); + in1 = lsx_loadu_f32(in_p + 4); + submean0 = lsx_sub_f32(in0, vmean); + submean1 = lsx_sub_f32(in1, vmean); + out0 = lsx_fmadd_f32(submean0, vsstd, vbias); + out1 = lsx_fmadd_f32(submean1, vsstd, vbias); + + lsx_storeu_f32(out_p, out0); + lsx_storeu_f32(out_p + 4, out1); + + in_p += 8; + out_p += 8; + } + for (; j > 3; j -= 4) { + in0 = lsx_loadu_f32(in_p); + submean0 = lsx_sub_f32(in0, vmean); + out0 = lsx_fmadd_f32(submean0, vsstd, vbias); + + lsx_storeu_f32(out_p, out0); + + in_p += 4; + out_p += 4; + } + for (; j > 0; j--) { + *out_p = (*in_p - mean_val) * sstd_val + bias_val; + in_p++; + out_p++; + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/interpolate.cc b/lite/backends/loongarch/math/interpolate.cc new file mode 100644 index 00000000000..4d8c943fae7 --- /dev/null +++ b/lite/backends/loongarch/math/interpolate.cc @@ -0,0 +1,729 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/interpolate.h" +#include +#include +#include "lite/backends/loongarch/math/math_function.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void bilinear_interp(const float* input_data, + float* output_data, + const float ratio_h, + const float ratio_w, + const int h_in, + const int w_in, + const int n, + const int c, + const int h_out, + const int w_out, + const bool align_corners, + const bool align_mode) { + int* buf = static_cast( + lite::host::malloc(sizeof(int) * (w_out * 4 + h_out * 4))); + int* xofs = buf; + int* yofs = buf + w_out * 2; + + float* alpha = reinterpret_cast(buf + w_out * 2 + h_out * 2); + float* beta = reinterpret_cast(buf + h_out * 2 + w_out * 4); + + float fx = 0.0f; + float fy = 0.0f; + int sx = 0; + int sy = 0; + if (align_corners) { + // calculate x axis coordinate + for (int dx = 0; dx < w_out; dx++) { + fx = dx * ratio_w; + sx = static_cast(fx); + fx -= sx; + xofs[dx * 2] = sx; + xofs[dx * 2 + 1] = (sx + 1) < w_in - 1 ? (sx + 1) : (w_in - 1); + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } + // calculate y axis coordinate + for (int dy = 0; dy < h_out; dy++) { + fy = dy * ratio_h; + sy = static_cast(fy); + fy -= sy; + yofs[dy * 2] = sy; + yofs[dy * 2 + 1] = (sy + 1) < h_in - 1 ? (sy + 1) : (h_in - 1); + beta[dy * 2] = 1.f - fy; + beta[dy * 2 + 1] = fy; + } + } else { + // calculate x axis coordinate + for (int dx = 0; dx < w_out; dx++) { + fx = align_mode ? ratio_w * dx : ratio_w * (dx + 0.5f) - 0.5f; + fx = fx < 0 ? 0.f : fx; + sx = static_cast(fx); + fx -= sx; + xofs[dx * 2] = sx; + xofs[dx * 2 + 1] = (sx + 1) < w_in - 1 ? 
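      // (editorial note) xofs[2*dx] / xofs[2*dx+1] are the left/right source
      // columns for output column dx (clamped to the last input column), and
      // alpha[2*dx] / alpha[2*dx+1] are the matching interpolation weights
      // (1 - fx, fx); yofs/beta are filled the same way for the y axis.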
(sx + 1) : (w_in - 1); + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } + // calculate y axis coordinate + for (int dy = 0; dy < h_out; dy++) { + fy = align_mode ? ratio_h * dy : ratio_h * (dy + 0.5f) - 0.5f; + fy = fy < 0 ? 0.f : fy; + sy = static_cast(fy); + fy -= sy; + yofs[dy * 2] = sy; + yofs[dy * 2 + 1] = (sy + 1) < h_in - 1 ? (sy + 1) : (h_in - 1); + beta[dy * 2] = 1.f - fy; + beta[dy * 2 + 1] = fy; + } + } + // output w , h boundary + int w_bound = w_out; + int h_bound = h_out; + if (ratio_w > 0 && ratio_h > 0) { + if (align_corners) { + w_bound = ceil((w_in - 1) / ratio_w); + h_bound = ceil((h_in - 1) / ratio_h); + } else { + w_bound = ceil((w_in - 0.5f) / ratio_w - 0.5f); + h_bound = ceil((h_in - 0.5f) / ratio_h - 0.5f); + } + } + int in_stride = h_in * w_in; + int out_stride = h_out * w_out; + int total = n * c; + + for (int nc = 0; nc < total; ++nc) { + const float* src = input_data + nc * in_stride; + float* dst = output_data + nc * out_stride; + const float* betap = beta; + + float* rowsbuf0 = + static_cast(lite::host::malloc(sizeof(int) * w_out)); + float* rowsbuf1 = + static_cast(lite::host::malloc(sizeof(int) * w_out)); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + // h_bound loop + for (int dy = 0; dy < h_bound; dy++) { + int sy0 = yofs[dy * 2]; + int sy1 = yofs[dy * 2 + 1]; + + const float* s0 = src + sy0 * w_in; + const float* s1 = src + sy1 * w_in; + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + + int dx = 0; +// w_bound loop +#ifdef __loongarch_asx + for (; dx + 3 < w_bound; dx += 4) { + int x0 = xofs[dx * 2]; + int x1 = xofs[(dx + 1) * 2]; + int x2 = xofs[(dx + 2) * 2]; + int x3 = xofs[(dx + 3) * 2]; + int x01 = xofs[dx * 2 + 1]; + int x11 = xofs[(dx + 1) * 2 + 1]; + int x21 = xofs[(dx + 2) * 2 + 1]; + int x31 = xofs[(dx + 3) * 2 + 1]; + + const float* s0p0 = s0 + x0; + const float* s0p1 = s0 + x1; + const float* s0p2 = s0 + x2; + const float* s0p3 = s0 + x3; + + const float* s0p0_1 = s0 + x01; + const float* s0p1_1 = s0 + x11; + const float* s0p2_1 = s0 + x21; + const float* s0p3_1 = s0 + x31; + + const float* s1p0 = s1 + x0; + const float* s1p1 = s1 + x1; + const float* s1p2 = s1 + x2; + const float* s1p3 = s1 + x3; + + const float* s1p0_1 = s1 + x01; + const float* s1p1_1 = s1 + x11; + const float* s1p2_1 = s1 + x21; + const float* s1p3_1 = s1 + x31; + + __m256 _a = lasx_loadu_f32(alphap); + + __m256 _s0p0p3 = lasx_set_f32( + *s0p3_1, *s0p3, *s0p2_1, *s0p2, *s0p1_1, *s0p1, *s0p0_1, *s0p0); + + __m256 _ms0 = lasx_mul_f32(_s0p0p3, _a); + __m256 _s1p0p3 = lasx_set_f32( + *s1p3_1, *s1p3, *s1p2_1, *s1p2, *s1p1_1, *s1p1, *s1p0_1, *s1p0); + __m256 _ms1 = lasx_mul_f32(_s1p0p3, _a); + + __m256 _rows0 = lasx_hadd_f32(_ms0, _ms0); + __m256 _rows1 = lasx_hadd_f32(_ms1, _ms1); + + __m256 _rs0 = lasx_castf64_f32( + lasx_permute4x64_f64(lasx_castf32_f64(_rows0), 0b11011000)); + __m256 _rs1 = lasx_castf64_f32( + lasx_permute4x64_f64(lasx_castf32_f64(_rows1), 0b11011000)); + lsx_storeu_f32(rows0p + dx, lasx_castm256_m128(_rs0)); + lsx_storeu_f32(rows1p + dx, lasx_castm256_m128(_rs1)); + + alphap += 8; + } +#endif + + // w_bound remain loop + for (; dx < w_bound; ++dx) { + int sx = xofs[dx * 2]; + int sx1 = xofs[dx * 2 + 1]; + const float* s0p = s0 + sx; + const float* s1p = s1 + sx; + const float* s0p1 = s0 + sx1; + const float* s1p1 = s1 + sx1; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = s0p[0] * a0 + s0p1[0] * a1; + rows1p[dx] = s1p[0] * a0 + s1p1[0] * a1; + alphap += 2; + } + + float param0 = *(src + 
sy0 * w_in + w_in - 1); + float param1 = *(src + sy1 * w_in + w_in - 1); + const float buffer0[2] = {param0, param0}; + const float buffer1[2] = {param1, param1}; +#ifdef __loongarch_asx + __m256 _s0p0p3 = lasx_set1_f32(param0); + __m256 _s1p0p3 = lasx_set1_f32(param1); + for (; dx + 3 < w_out; dx += 4) { + __m256 _a = lasx_loadu_f32(alphap); + + __m256 _ms0 = lasx_mul_f32(_s0p0p3, _a); + __m256 _ms1 = lasx_mul_f32(_s1p0p3, _a); + + __m256 _rows0 = lasx_hadd_f32(_ms0, _ms0); + __m256 _rows1 = lasx_hadd_f32(_ms1, _ms1); + + __m256 _rs0 = lasx_castf64_f32( + lasx_permute4x64_f64(lasx_castf32_f64(_rows0), 0b11011000)); + __m256 _rs1 = lasx_castf64_f32( + lasx_permute4x64_f64(lasx_castf32_f64(_rows1), 0b11011000)); + lsx_storeu_f32(rows0p + dx, lasx_castm256_m128(_rs0)); + lsx_storeu_f32(rows1p + dx, lasx_castm256_m128(_rs1)); + + alphap += 8; + } +#endif + + // w_bound - w_out remain loop + for (; dx < w_out; dx++) { + const float* s0p = buffer0; + const float* s1p = buffer1; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; + rows1p[dx] = s1p[0] * a0 + s1p[1] * a1; + + alphap += 2; + } + + float b0 = betap[0]; + float b1 = betap[1]; + + // output pos + float* dp = dst + dy * w_out; + + int nn = 0; + +#ifdef __loongarch_asx + // 8 float + __m256 _b0 = lasx_set1_f32(b0); + __m256 _b1 = lasx_set1_f32(b1); + // calculate and store results + for (; nn + 7 < w_out; nn += 8) { + __m256 _rows0 = lasx_loadu_f32(rows0p); + __m256 _rows1 = lasx_loadu_f32(rows1p); + + __m256 _d = lasx_add_f32(lasx_mul_f32(_rows0, _b0), + lasx_mul_f32(_rows1, _b1)); + lasx_storeu_f32(dp, _d); + + dp += 8; + rows0p += 8; + rows1p += 8; + } + + // 4 float + __m128 _c0 = lsx_set1_f32(b0); + __m128 _c1 = lsx_set1_f32(b1); + for (; nn + 3 < w_out; nn += 4) { + __m128 _rows0 = lsx_loadu_f32(rows0p); + __m128 _rows1 = lsx_loadu_f32(rows1p); + + __m128 _d = + lsx_add_f32(lsx_mul_f32(_rows0, _c0), lsx_mul_f32(_rows1, _c1)); + lsx_storeu_f32(dp, _d); + + dp += 4; + rows0p += 4; + rows1p += 4; + } +#endif + + // calculate and store remain resluts + for (; nn < w_out; ++nn) { + *dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + betap += 2; + } // end h_bound loop + + // h_bound - h_out loop + for (int dy = h_bound; dy < h_out; dy++) { + int sy = h_in - 1; + const float* s0 = src + sy * w_in; + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + + int dx = 0; +#ifdef __loongarch_asx + const float* s1 = s0; + + // w_bound loop + for (; dx + 3 < w_bound; dx += 4) { + int x0 = xofs[dx * 2]; + int x1 = xofs[(dx + 1) * 2]; + int x2 = xofs[(dx + 2) * 2]; + int x3 = xofs[(dx + 3) * 2]; + int x01 = xofs[dx * 2 + 1]; + int x11 = xofs[(dx + 1) * 2 + 1]; + int x21 = xofs[(dx + 2) * 2 + 1]; + int x31 = xofs[(dx + 3) * 2 + 1]; + + const float* s0p0 = s0 + x0; + const float* s0p1 = s0 + x1; + const float* s0p2 = s0 + x2; + const float* s0p3 = s0 + x3; + + const float* s0p0_1 = s0 + x01; + const float* s0p1_1 = s0 + x11; + const float* s0p2_1 = s0 + x21; + const float* s0p3_1 = s0 + x31; + + const float* s1p0 = s1 + x0; + const float* s1p1 = s1 + x1; + const float* s1p2 = s1 + x2; + const float* s1p3 = s1 + x3; + + const float* s1p0_1 = s1 + x01; + const float* s1p1_1 = s1 + x11; + const float* s1p2_1 = s1 + x21; + const float* s1p3_1 = s1 + x31; + + __m256 _a = lasx_loadu_f32(alphap); + + __m256 _s0p0p3 = lasx_set_f32( + *s0p3_1, *s0p3, *s0p2_1, *s0p2, *s0p1_1, *s0p1, *s0p0_1, *s0p0); + __m256 _ms0 = lasx_mul_f32(_s0p0p3, _a); + __m256 _s1p0p3 = lasx_set_f32( + *s1p3_1, *s1p3, 
*s1p2_1, *s1p2, *s1p1_1, *s1p1, *s1p0_1, *s1p0); + __m256 _ms1 = lasx_mul_f32(_s1p0p3, _a); + + __m256 _rows0 = lasx_hadd_f32(_ms0, _ms0); + __m256 _rows1 = lasx_hadd_f32(_ms1, _ms1); + + __m256 _rs0 = lasx_castf64_f32( + lasx_permute4x64_f64(lasx_castf32_f64(_rows0), 0b11011000)); + __m256 _rs1 = lasx_castf64_f32( + lasx_permute4x64_f64(lasx_castf32_f64(_rows1), 0b11011000)); + lsx_storeu_f32(rows0p + dx, lasx_castm256_m128(_rs0)); + lsx_storeu_f32(rows1p + dx, lasx_castm256_m128(_rs1)); + + alphap += 8; + } +#endif + + // w_bound remain loop + for (; dx < w_bound; ++dx) { + int sx = xofs[dx * 2]; + int sx1 = xofs[dx * 2 + 1]; + const float* s0p = s0 + sx; + const float* s0p1 = s0 + sx1; + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = s0p[0] * a0 + s0p1[0] * a1; + rows1p[dx] = rows0p[dx]; + + alphap += 2; + } + + float param = *(src + sy * w_in + w_in - 1); + const float buffer1[2] = {param, param}; + +#ifdef __loongarch_asx + __m256 _s0p0p3 = lasx_set1_f32(param); + __m256 _s1p0p3 = lasx_set1_f32(param); + + // w_bound - w_out loop + for (; dx + 3 < w_out; dx += 4) { + __m256 _a = lasx_loadu_f32(alphap); + + __m256 _ms0 = lasx_mul_f32(_s0p0p3, _a); + __m256 _ms1 = lasx_mul_f32(_s1p0p3, _a); + + __m256 _rows0 = lasx_hadd_f32(_ms0, _ms0); + __m256 _rows1 = lasx_hadd_f32(_ms1, _ms1); + + __m256 _rs0 = lasx_castf64_f32( + lasx_permute4x64_f64(lasx_castf32_f64(_rows0), 0b11011000)); + __m256 _rs1 = lasx_castf64_f32( + lasx_permute4x64_f64(lasx_castf32_f64(_rows1), 0b11011000)); + lsx_storeu_f32(rows0p + dx, lasx_castm256_m128(_rs0)); + lsx_storeu_f32(rows1p + dx, lasx_castm256_m128(_rs1)); + + alphap += 8; + } +#endif + + // w_bound - wout remain loop + for (; dx < w_out; dx++) { + const float* s0p = buffer1; + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; + rows1p[dx] = rows0p[dx]; + alphap += 2; + } + + float b0 = betap[0]; + float b1 = betap[1]; + + float* dp = dst + dy * w_out; + + int nn = 0; + +#ifdef __loongarch_asx + // 8 float + __m256 _b0 = lasx_set1_f32(b0); + __m256 _b1 = lasx_set1_f32(b1); + // calculate and store results + for (; nn + 7 < w_out; nn += 8) { + __m256 _rows0 = lasx_loadu_f32(rows0p); + __m256 _rows1 = lasx_loadu_f32(rows1p); + + __m256 _d = lasx_add_f32(lasx_mul_f32(_rows0, _b0), + lasx_mul_f32(_rows1, _b1)); + lasx_storeu_f32(dp, _d); + + dp += 8; + rows0p += 8; + rows1p += 8; + } + + // 4 float + __m128 _c0 = lsx_set1_f32(b0); + __m128 _c1 = lsx_set1_f32(b1); + for (; nn + 3 < w_out; nn += 4) { + __m128 _rows0 = lsx_loadu_f32(rows0p); + __m128 _rows1 = lsx_loadu_f32(rows1p); + + __m128 _d = + lsx_add_f32(lsx_mul_f32(_rows0, _c0), lsx_mul_f32(_rows1, _c1)); + lsx_storeu_f32(dp, _d); + + dp += 4; + rows0p += 4; + rows1p += 4; + } +#endif + // calculate and store remain results + for (; nn < w_out; ++nn) { + *dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + + betap += 2; + } // end h_bound - h_out loop + lite::host::free(rowsbuf0); + lite::host::free(rowsbuf1); + } + lite::host::free(buf); +} + +void nearest_interp(const float* input_data, + float* output_data, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int in_h, + const int in_w, + const int out_h, + const int out_w, + const bool align_corners) { + int total_count = n * c; + if (align_corners) { + for (int i = 0; i < total_count; ++i) { + for (int h = 0; h < out_h; ++h) { + for (int w = 0; w < out_w; ++w) { + const float* input_data_ptr = input_data + i * in_h * in_w; + float* output_data_ptr = + output_data + i * out_h * 
out_w + h * out_w + w; + int near_y = static_cast(ratio_h * h + 0.5); + int near_x = static_cast(ratio_w * w + 0.5); + *output_data_ptr = input_data_ptr[near_y * in_w + near_x]; + } + } + } + } else { + for (int i = 0; i < total_count; ++i) { + for (int h = 0; h < out_h; ++h) { + for (int w = 0; w < out_w; ++w) { + const float* input_data_ptr = input_data + i * in_h * in_w; + float* output_data_ptr = + output_data + i * out_h * out_w + h * out_w + w; + int near_y = static_cast(ratio_h * h); + int near_x = static_cast(ratio_w * w); + *output_data_ptr = input_data_ptr[near_y * in_w + near_x]; + } + } + } + } +} + +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + +void interpolate(lite::Tensor* input, + lite::Tensor* out_size, + std::vector list_new_size_tensor, + lite::Tensor* scale_tensor, + lite::Tensor* output, + float scale, + std::vector scale_v, + int out_h, + int out_w, + const int align_mode, + const bool align_corners, + const std::string interpolate_type) { + // format NCHW + int n = input->dims()[0]; + int c = input->dims()[1]; + int in_h = input->dims()[2]; + int in_w = input->dims()[3]; + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else if (scale_v.size() == 2) { + if (scale_v[0] > 0 && scale_v[1] > 0) { + out_h = static_cast(in_h * scale_v[0]); + out_w = static_cast(in_w * scale_v[1]); + } + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + } else { + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + } + output->Resize({n, c, out_h, out_w}); + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
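  // (editorial note) with align_corners the first and last output samples map
  // exactly onto the image corners, hence ratio = (in - 1) / (out - 1);
  // otherwise the plain in / out scale is used and any half-pixel shift is
  // handled inside bilinear_interp via align_mode.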
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + const float* input_data = input->data(); + float* output_data = output->mutable_data(); + if ("Bilinear" == interpolate_type) { + bilinear_interp(input_data, + output_data, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + align_mode); + } else if ("Nearest" == interpolate_type) { + nearest_interp(input_data, + output_data, + ratio_h, + ratio_w, + n, + c, + in_h, + in_w, + out_h, + out_w, + align_corners); + } else { + LOG(FATAL) << "Not supported interpolate_type: " << interpolate_type; + } +} + +void interpolate_v2(lite::Tensor* input, + lite::Tensor* out_size, + std::vector list_new_size_tensor, + lite::Tensor* scale_tensor, + lite::Tensor* output, + float scale, + std::vector scale_v, + int out_h, + int out_w, + const int align_mode, + const bool align_corners, + const std::string interpolate_type) { + // format NCHW + int n = input->dims()[0]; + int c = input->dims()[1]; + int in_h = input->dims()[2]; + int in_w = input->dims()[3]; + float scale_h = -1; + float scale_w = -1; + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + } else { + if (scale_v.size() > 1 && scale_v[0] > 0 && scale_v[1] > 0) { + scale_h = scale_v[0]; + scale_w = scale_v[1]; + } + } + + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + } + output->Resize({n, c, out_h, out_w}); + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + const float* input_data = input->data(); + float* output_data = output->mutable_data(); + if ("Bilinear" == interpolate_type) { + bilinear_interp(input_data, + output_data, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + align_mode); + } else if ("Nearest" == interpolate_type) { + nearest_interp(input_data, + output_data, + ratio_h, + ratio_w, + n, + c, + in_h, + in_w, + out_h, + out_w, + align_corners); + } else { + LOG(FATAL) << "Not supported interpolate_type: " << interpolate_type; + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/interpolate.h b/lite/backends/loongarch/math/interpolate.h new file mode 100644 index 00000000000..ff0e2e037fd --- /dev/null +++ b/lite/backends/loongarch/math/interpolate.h @@ -0,0 +1,79 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void bilinear_interp(const float* input_data, + float* output_data, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const bool align_mode); + +void nearest_interp(const float* input_data, + float* output_data, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int in_h, + const int in_w, + const int out_h, + const int out_w, + const bool align_corners); + +void interpolate(lite::Tensor* input, + lite::Tensor* out_size, + std::vector list_new_size_tensor, + lite::Tensor* scale_tensor, + lite::Tensor* output, + float scale, + std::vector scale_v, + int out_h, + int out_w, + const int align_mode, + const bool align_corners, + const std::string interpolate_type); + +void interpolate_v2(lite::Tensor* input, + lite::Tensor* out_size, + std::vector list_new_size_tensor, + lite::Tensor* scale_tensor, + lite::Tensor* output, + float scale, + std::vector scale_v, + int out_h, + int out_w, + const int align_mode, + const bool align_corners, + const std::string interpolate_type); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lasx/conv_depthwise_pack8.cc b/lite/backends/loongarch/math/lasx/conv_depthwise_pack8.cc new file mode 100644 index 00000000000..1e7e2a0aff5 --- /dev/null +++ b/lite/backends/loongarch/math/lasx/conv_depthwise_pack8.cc @@ -0,0 +1,804 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/loongarch/math/lasx/conv_depthwise_pack8.h" +#include +#include "lite/backends/loongarch/math/common/conv_utils.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +// input [bs, ic/8, ih, iw, 8] +// filter [1, oc/8, kh, kw, 8] +// bias [ oc ] +// output [bs, oc/8, oh, ow, 8] +void conv_depthwise_3x3s1_m256(lite::Tensor* input, + lite::Tensor* output, + lite::Tensor* filter, + lite::Tensor* bias, + const bool has_act, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param) { + // input [bs, ic/8, ih, iw, 8] + CHECK_EQ(input->dims().size(), 5UL); + const int batch_size = input->dims()[0]; + const int channel_num = input->dims()[1]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const float* input_data = input->data(); + + // filter [1, oc/8, kh, kw, 8] + CHECK_EQ(filter->dims().size(), 5UL); + const int kernel_h = filter->dims()[2]; + const int kernel_w = filter->dims()[3]; + const float* filter_data = filter->data(); + + // output [bs, oc/8, oh, ow, 8] + CHECK_EQ(output->dims().size(), 5UL); + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + float* output_data = output->mutable_data(); + + const int input_group_step = input_width * 8; + const int input_channel_step = input_height * input_width * 8; + const int input_batch_step = channel_num * input_height * input_width * 8; + + const int filter_channel_step = kernel_h * kernel_w * 8; + + int total_count = batch_size * channel_num; + + for (int idx = 0; idx < total_count; ++idx) { + __m256 _bias0 = + bias ? lasx_loadu_f32(bias->data() + (idx % channel_num) * 8) + : lasx_set1_f32(0.f); + + const float* k0 = filter_data + (idx % channel_num) * filter_channel_step; + + const float* r0 = input_data + (idx / channel_num) * input_batch_step + + (idx % channel_num) * input_channel_step; + const float* r1 = r0 + input_group_step; + const float* r2 = r1 + input_group_step; + + __m256 _k00 = lasx_loadu_f32(k0); + __m256 _k01 = lasx_loadu_f32(k0 + 8); + __m256 _k02 = lasx_loadu_f32(k0 + 16); + __m256 _k10 = lasx_loadu_f32(k0 + 24); + __m256 _k11 = lasx_loadu_f32(k0 + 32); + __m256 _k12 = lasx_loadu_f32(k0 + 40); + __m256 _k20 = lasx_loadu_f32(k0 + 48); + __m256 _k21 = lasx_loadu_f32(k0 + 56); + __m256 _k22 = lasx_loadu_f32(k0 + 64); + + for (int i = 0; i < output_height; ++i) { + int j = 0; + for (; j + 7 < output_width; j += 8) { + __m256 _sum0 = _bias0; + + __m256 _r00 = lasx_loadu_f32(r0); + __m256 _r01 = lasx_loadu_f32(r0 + 8); + __m256 _r02 = lasx_loadu_f32(r0 + 16); + __m256 _r10 = lasx_loadu_f32(r1); + __m256 _r11 = lasx_loadu_f32(r1 + 8); + __m256 _r12 = lasx_loadu_f32(r1 + 16); + __m256 _r20 = lasx_loadu_f32(r2); + __m256 _r21 = lasx_loadu_f32(r2 + 8); + __m256 _r22 = lasx_loadu_f32(r2 + 16); + + _sum0 = lasx_fmadd_f32(_k00, _r00, _sum0); + _sum0 = lasx_fmadd_f32(_k01, _r01, _sum0); + _sum0 = lasx_fmadd_f32(_k02, _r02, _sum0); + _sum0 = lasx_fmadd_f32(_k10, _r10, _sum0); + _sum0 = lasx_fmadd_f32(_k11, _r11, _sum0); + _sum0 = lasx_fmadd_f32(_k12, _r12, _sum0); + _sum0 = lasx_fmadd_f32(_k20, _r20, _sum0); + _sum0 = lasx_fmadd_f32(_k21, _r21, _sum0); + _sum0 = lasx_fmadd_f32(_k22, _r22, _sum0); + + if (has_act) { + _sum0 = activation8_m256(_sum0, act_type, act_param); + } + + lasx_storeu_f32(output_data, _sum0); + + __m256 _sum1 = _bias0; + __m256 _r03 = lasx_loadu_f32(r0 + 24); + __m256 _r13 = lasx_loadu_f32(r1 + 24); + __m256 _r23 = lasx_loadu_f32(r2 + 24); + + 
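        // (editorial note) this j-loop is unrolled by 8 output positions:
        // _sumN is the packed 8-channel output at column j+N, and because the
        // stride is 1 each additional output reuses two of the previous three
        // packed input columns, so only one new column per input row has to be
        // loaded, as for _r03/_r13/_r23 above.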
_sum1 = lasx_fmadd_f32(_k00, _r01, _sum1); + _sum1 = lasx_fmadd_f32(_k01, _r02, _sum1); + _sum1 = lasx_fmadd_f32(_k02, _r03, _sum1); + _sum1 = lasx_fmadd_f32(_k10, _r11, _sum1); + _sum1 = lasx_fmadd_f32(_k11, _r12, _sum1); + _sum1 = lasx_fmadd_f32(_k12, _r13, _sum1); + _sum1 = lasx_fmadd_f32(_k20, _r21, _sum1); + _sum1 = lasx_fmadd_f32(_k21, _r22, _sum1); + _sum1 = lasx_fmadd_f32(_k22, _r23, _sum1); + + if (has_act) { + _sum1 = activation8_m256(_sum1, act_type, act_param); + } + lasx_storeu_f32(output_data + 8, _sum1); + + __m256 _sum2 = _bias0; + __m256 _r04 = lasx_loadu_f32(r0 + 32); + __m256 _r14 = lasx_loadu_f32(r1 + 32); + __m256 _r24 = lasx_loadu_f32(r2 + 32); + + _sum2 = lasx_fmadd_f32(_k00, _r02, _sum2); + _sum2 = lasx_fmadd_f32(_k01, _r03, _sum2); + _sum2 = lasx_fmadd_f32(_k02, _r04, _sum2); + _sum2 = lasx_fmadd_f32(_k10, _r12, _sum2); + _sum2 = lasx_fmadd_f32(_k11, _r13, _sum2); + _sum2 = lasx_fmadd_f32(_k12, _r14, _sum2); + _sum2 = lasx_fmadd_f32(_k20, _r22, _sum2); + _sum2 = lasx_fmadd_f32(_k21, _r23, _sum2); + _sum2 = lasx_fmadd_f32(_k22, _r24, _sum2); + + if (has_act) { + _sum2 = activation8_m256(_sum2, act_type, act_param); + } + lasx_storeu_f32(output_data + 16, _sum2); + + __m256 _sum3 = _bias0; + __m256 _r05 = lasx_loadu_f32(r0 + 40); + __m256 _r15 = lasx_loadu_f32(r1 + 40); + __m256 _r25 = lasx_loadu_f32(r2 + 40); + + _sum3 = lasx_fmadd_f32(_k00, _r03, _sum3); + _sum3 = lasx_fmadd_f32(_k01, _r04, _sum3); + _sum3 = lasx_fmadd_f32(_k02, _r05, _sum3); + _sum3 = lasx_fmadd_f32(_k10, _r13, _sum3); + _sum3 = lasx_fmadd_f32(_k11, _r14, _sum3); + _sum3 = lasx_fmadd_f32(_k12, _r15, _sum3); + _sum3 = lasx_fmadd_f32(_k20, _r23, _sum3); + _sum3 = lasx_fmadd_f32(_k21, _r24, _sum3); + _sum3 = lasx_fmadd_f32(_k22, _r25, _sum3); + + if (has_act) { + _sum3 = activation8_m256(_sum3, act_type, act_param); + } + lasx_storeu_f32(output_data + 24, _sum3); + + __m256 _sum4 = _bias0; + __m256 _r06 = lasx_loadu_f32(r0 + 48); + __m256 _r16 = lasx_loadu_f32(r1 + 48); + __m256 _r26 = lasx_loadu_f32(r2 + 48); + + _sum4 = lasx_fmadd_f32(_k00, _r04, _sum4); + _sum4 = lasx_fmadd_f32(_k01, _r05, _sum4); + _sum4 = lasx_fmadd_f32(_k02, _r06, _sum4); + _sum4 = lasx_fmadd_f32(_k10, _r14, _sum4); + _sum4 = lasx_fmadd_f32(_k11, _r15, _sum4); + _sum4 = lasx_fmadd_f32(_k12, _r16, _sum4); + _sum4 = lasx_fmadd_f32(_k20, _r24, _sum4); + _sum4 = lasx_fmadd_f32(_k21, _r25, _sum4); + _sum4 = lasx_fmadd_f32(_k22, _r26, _sum4); + + if (has_act) { + _sum4 = activation8_m256(_sum4, act_type, act_param); + } + lasx_storeu_f32(output_data + 32, _sum4); + + __m256 _sum5 = _bias0; + __m256 _r07 = lasx_loadu_f32(r0 + 56); + __m256 _r17 = lasx_loadu_f32(r1 + 56); + __m256 _r27 = lasx_loadu_f32(r2 + 56); + + _sum5 = lasx_fmadd_f32(_k00, _r05, _sum5); + _sum5 = lasx_fmadd_f32(_k01, _r06, _sum5); + _sum5 = lasx_fmadd_f32(_k02, _r07, _sum5); + _sum5 = lasx_fmadd_f32(_k10, _r15, _sum5); + _sum5 = lasx_fmadd_f32(_k11, _r16, _sum5); + _sum5 = lasx_fmadd_f32(_k12, _r17, _sum5); + _sum5 = lasx_fmadd_f32(_k20, _r25, _sum5); + _sum5 = lasx_fmadd_f32(_k21, _r26, _sum5); + _sum5 = lasx_fmadd_f32(_k22, _r27, _sum5); + + if (has_act) { + _sum5 = activation8_m256(_sum5, act_type, act_param); + } + lasx_storeu_f32(output_data + 40, _sum5); + + __m256 _sum6 = _bias0; + __m256 _r08 = lasx_loadu_f32(r0 + 64); + __m256 _r18 = lasx_loadu_f32(r1 + 64); + __m256 _r28 = lasx_loadu_f32(r2 + 64); + + _sum6 = lasx_fmadd_f32(_k00, _r06, _sum6); + _sum6 = lasx_fmadd_f32(_k01, _r07, _sum6); + _sum6 = lasx_fmadd_f32(_k02, _r08, _sum6); + _sum6 = 
lasx_fmadd_f32(_k10, _r16, _sum6); + _sum6 = lasx_fmadd_f32(_k11, _r17, _sum6); + _sum6 = lasx_fmadd_f32(_k12, _r18, _sum6); + _sum6 = lasx_fmadd_f32(_k20, _r26, _sum6); + _sum6 = lasx_fmadd_f32(_k21, _r27, _sum6); + _sum6 = lasx_fmadd_f32(_k22, _r28, _sum6); + + if (has_act) { + _sum6 = activation8_m256(_sum6, act_type, act_param); + } + lasx_storeu_f32(output_data + 48, _sum6); + + __m256 _sum7 = _bias0; + __m256 _r09 = lasx_loadu_f32(r0 + 72); + __m256 _r19 = lasx_loadu_f32(r1 + 72); + __m256 _r29 = lasx_loadu_f32(r2 + 72); + + _sum7 = lasx_fmadd_f32(_k00, _r07, _sum7); + _sum7 = lasx_fmadd_f32(_k01, _r08, _sum7); + _sum7 = lasx_fmadd_f32(_k02, _r09, _sum7); + _sum7 = lasx_fmadd_f32(_k10, _r17, _sum7); + _sum7 = lasx_fmadd_f32(_k11, _r18, _sum7); + _sum7 = lasx_fmadd_f32(_k12, _r19, _sum7); + _sum7 = lasx_fmadd_f32(_k20, _r27, _sum7); + _sum7 = lasx_fmadd_f32(_k21, _r28, _sum7); + _sum7 = lasx_fmadd_f32(_k22, _r29, _sum7); + + if (has_act) { + _sum7 = activation8_m256(_sum7, act_type, act_param); + } + lasx_storeu_f32(output_data + 56, _sum7); + + r0 += 64; + r1 += 64; + r2 += 64; + output_data += 64; + } + for (; j + 3 < output_width; j += 4) { + __m256 _sum0 = _bias0; + + __m256 _r00 = lasx_loadu_f32(r0); + __m256 _r01 = lasx_loadu_f32(r0 + 8); + __m256 _r02 = lasx_loadu_f32(r0 + 16); + __m256 _r10 = lasx_loadu_f32(r1); + __m256 _r11 = lasx_loadu_f32(r1 + 8); + __m256 _r12 = lasx_loadu_f32(r1 + 16); + __m256 _r20 = lasx_loadu_f32(r2); + __m256 _r21 = lasx_loadu_f32(r2 + 8); + __m256 _r22 = lasx_loadu_f32(r2 + 16); + + _sum0 = lasx_fmadd_f32(_k00, _r00, _sum0); + _sum0 = lasx_fmadd_f32(_k01, _r01, _sum0); + _sum0 = lasx_fmadd_f32(_k02, _r02, _sum0); + _sum0 = lasx_fmadd_f32(_k10, _r10, _sum0); + _sum0 = lasx_fmadd_f32(_k11, _r11, _sum0); + _sum0 = lasx_fmadd_f32(_k12, _r12, _sum0); + _sum0 = lasx_fmadd_f32(_k20, _r20, _sum0); + _sum0 = lasx_fmadd_f32(_k21, _r21, _sum0); + _sum0 = lasx_fmadd_f32(_k22, _r22, _sum0); + + if (has_act) { + _sum0 = activation8_m256(_sum0, act_type, act_param); + } + lasx_storeu_f32(output_data, _sum0); + + __m256 _sum1 = _bias0; + __m256 _r03 = lasx_loadu_f32(r0 + 24); + __m256 _r13 = lasx_loadu_f32(r1 + 24); + __m256 _r23 = lasx_loadu_f32(r2 + 24); + + _sum1 = lasx_fmadd_f32(_k00, _r01, _sum1); + _sum1 = lasx_fmadd_f32(_k01, _r02, _sum1); + _sum1 = lasx_fmadd_f32(_k02, _r03, _sum1); + _sum1 = lasx_fmadd_f32(_k10, _r11, _sum1); + _sum1 = lasx_fmadd_f32(_k11, _r12, _sum1); + _sum1 = lasx_fmadd_f32(_k12, _r13, _sum1); + _sum1 = lasx_fmadd_f32(_k20, _r21, _sum1); + _sum1 = lasx_fmadd_f32(_k21, _r22, _sum1); + _sum1 = lasx_fmadd_f32(_k22, _r23, _sum1); + + if (has_act) { + _sum1 = activation8_m256(_sum1, act_type, act_param); + } + lasx_storeu_f32(output_data + 8, _sum1); + + __m256 _sum2 = _bias0; + __m256 _r04 = lasx_loadu_f32(r0 + 32); + __m256 _r14 = lasx_loadu_f32(r1 + 32); + __m256 _r24 = lasx_loadu_f32(r2 + 32); + + _sum2 = lasx_fmadd_f32(_k00, _r02, _sum2); + _sum2 = lasx_fmadd_f32(_k01, _r03, _sum2); + _sum2 = lasx_fmadd_f32(_k02, _r04, _sum2); + _sum2 = lasx_fmadd_f32(_k10, _r12, _sum2); + _sum2 = lasx_fmadd_f32(_k11, _r13, _sum2); + _sum2 = lasx_fmadd_f32(_k12, _r14, _sum2); + _sum2 = lasx_fmadd_f32(_k20, _r22, _sum2); + _sum2 = lasx_fmadd_f32(_k21, _r23, _sum2); + _sum2 = lasx_fmadd_f32(_k22, _r24, _sum2); + + if (has_act) { + _sum2 = activation8_m256(_sum2, act_type, act_param); + } + lasx_storeu_f32(output_data + 16, _sum2); + + __m256 _sum3 = _bias0; + __m256 _r05 = lasx_loadu_f32(r0 + 40); + __m256 _r15 = lasx_loadu_f32(r1 + 40); + __m256 _r25 = 
lasx_loadu_f32(r2 + 40); + + _sum3 = lasx_fmadd_f32(_k00, _r03, _sum3); + _sum3 = lasx_fmadd_f32(_k01, _r04, _sum3); + _sum3 = lasx_fmadd_f32(_k02, _r05, _sum3); + _sum3 = lasx_fmadd_f32(_k10, _r13, _sum3); + _sum3 = lasx_fmadd_f32(_k11, _r14, _sum3); + _sum3 = lasx_fmadd_f32(_k12, _r15, _sum3); + _sum3 = lasx_fmadd_f32(_k20, _r23, _sum3); + _sum3 = lasx_fmadd_f32(_k21, _r24, _sum3); + _sum3 = lasx_fmadd_f32(_k22, _r25, _sum3); + + if (has_act) { + _sum3 = activation8_m256(_sum3, act_type, act_param); + } + lasx_storeu_f32(output_data + 24, _sum3); + + r0 += 32; + r1 += 32; + r2 += 32; + output_data += 32; + } + for (; j + 1 < output_width; j += 2) { + __m256 _sum0 = _bias0; + + __m256 _r00 = lasx_loadu_f32(r0); + __m256 _r01 = lasx_loadu_f32(r0 + 8); + __m256 _r02 = lasx_loadu_f32(r0 + 16); + __m256 _r10 = lasx_loadu_f32(r1); + __m256 _r11 = lasx_loadu_f32(r1 + 8); + __m256 _r12 = lasx_loadu_f32(r1 + 16); + __m256 _r20 = lasx_loadu_f32(r2); + __m256 _r21 = lasx_loadu_f32(r2 + 8); + __m256 _r22 = lasx_loadu_f32(r2 + 16); + + _sum0 = lasx_fmadd_f32(_k00, _r00, _sum0); + _sum0 = lasx_fmadd_f32(_k01, _r01, _sum0); + _sum0 = lasx_fmadd_f32(_k02, _r02, _sum0); + _sum0 = lasx_fmadd_f32(_k10, _r10, _sum0); + _sum0 = lasx_fmadd_f32(_k11, _r11, _sum0); + _sum0 = lasx_fmadd_f32(_k12, _r12, _sum0); + _sum0 = lasx_fmadd_f32(_k20, _r20, _sum0); + _sum0 = lasx_fmadd_f32(_k21, _r21, _sum0); + _sum0 = lasx_fmadd_f32(_k22, _r22, _sum0); + + if (has_act) { + _sum0 = activation8_m256(_sum0, act_type, act_param); + } + lasx_storeu_f32(output_data, _sum0); + + __m256 _sum1 = _bias0; + __m256 _r03 = lasx_loadu_f32(r0 + 24); + __m256 _r13 = lasx_loadu_f32(r1 + 24); + __m256 _r23 = lasx_loadu_f32(r2 + 24); + + _sum1 = lasx_fmadd_f32(_k00, _r01, _sum1); + _sum1 = lasx_fmadd_f32(_k01, _r02, _sum1); + _sum1 = lasx_fmadd_f32(_k02, _r03, _sum1); + _sum1 = lasx_fmadd_f32(_k10, _r11, _sum1); + _sum1 = lasx_fmadd_f32(_k11, _r12, _sum1); + _sum1 = lasx_fmadd_f32(_k12, _r13, _sum1); + _sum1 = lasx_fmadd_f32(_k20, _r21, _sum1); + _sum1 = lasx_fmadd_f32(_k21, _r22, _sum1); + _sum1 = lasx_fmadd_f32(_k22, _r23, _sum1); + + if (has_act) { + _sum1 = activation8_m256(_sum1, act_type, act_param); + } + lasx_storeu_f32(output_data + 8, _sum1); + + r0 += 16; + r1 += 16; + r2 += 16; + output_data += 16; + } + for (; j < output_width; ++j) { + __m256 _sum0 = _bias0; + + __m256 _r00 = lasx_loadu_f32(r0); + __m256 _r01 = lasx_loadu_f32(r0 + 8); + __m256 _r02 = lasx_loadu_f32(r0 + 16); + __m256 _r10 = lasx_loadu_f32(r1); + __m256 _r11 = lasx_loadu_f32(r1 + 8); + __m256 _r12 = lasx_loadu_f32(r1 + 16); + __m256 _r20 = lasx_loadu_f32(r2); + __m256 _r21 = lasx_loadu_f32(r2 + 8); + __m256 _r22 = lasx_loadu_f32(r2 + 16); + + _sum0 = lasx_fmadd_f32(_k00, _r00, _sum0); + _sum0 = lasx_fmadd_f32(_k01, _r01, _sum0); + _sum0 = lasx_fmadd_f32(_k02, _r02, _sum0); + _sum0 = lasx_fmadd_f32(_k10, _r10, _sum0); + _sum0 = lasx_fmadd_f32(_k11, _r11, _sum0); + _sum0 = lasx_fmadd_f32(_k12, _r12, _sum0); + _sum0 = lasx_fmadd_f32(_k20, _r20, _sum0); + _sum0 = lasx_fmadd_f32(_k21, _r21, _sum0); + _sum0 = lasx_fmadd_f32(_k22, _r22, _sum0); + + if (has_act) { + _sum0 = activation8_m256(_sum0, act_type, act_param); + } + lasx_storeu_f32(output_data, _sum0); + + r0 += 8; + r1 += 8; + r2 += 8; + output_data += 8; + } + r0 += 2 * 8; + r1 += 2 * 8; + r2 += 2 * 8; + } // end of for output_height + } // end of for batch_size * channel_num +} + +// input [bs, ic/8, ih, iw, 8] +// filter [1, oc/8, kh, kw, 8] +// bias [ oc ] +// output [bs, oc/8, oh, ow, 8] +void 
conv_depthwise_3x3s2_m256(lite::Tensor* input, + lite::Tensor* output, + lite::Tensor* filter, + lite::Tensor* bias, + const bool has_act, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param) { + // input [bs, ic/8, ih, iw, 8] + CHECK_EQ(input->dims().size(), 5UL); + const int batch_size = input->dims()[0]; + const int channel_num = input->dims()[1]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const float* input_data = input->data(); + + // filter [1, oc/8, kh, kw, 8] + CHECK_EQ(filter->dims().size(), 5UL); + const int kernel_h = filter->dims()[2]; + const int kernel_w = filter->dims()[3]; + const float* filter_data = filter->data(); + + // output [bs, oc/8, oh, ow, 8] + CHECK_EQ(output->dims().size(), 5UL); + const int output_height = output->dims()[2]; // 2 + const int output_width = output->dims()[3]; // 2 + float* output_data = output->mutable_data(); + + const int input_group_step = input_width * 8; + const int input_channel_step = input_height * input_width * 8; + const int input_batch_step = channel_num * input_height * input_width * 8; + + const int filter_channel_step = kernel_h * kernel_w * 8; + + const int tailstep = (input_width - 2 * output_width + input_width) * 8; + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + __m256 _bias0 = bias ? lasx_loadu_f32(bias->data() + ic * 8) + : lasx_set1_f32(0.f); + + const float* k0 = filter_data + ic * filter_channel_step; + + const float* r0 = + input_data + bs * input_batch_step + ic * input_channel_step; + const float* r1 = r0 + input_group_step; + const float* r2 = r1 + input_group_step; + + __m256 _k00 = lasx_loadu_f32(k0); + __m256 _k01 = lasx_loadu_f32(k0 + 8); + __m256 _k02 = lasx_loadu_f32(k0 + 16); + __m256 _k10 = lasx_loadu_f32(k0 + 24); + __m256 _k11 = lasx_loadu_f32(k0 + 32); + __m256 _k12 = lasx_loadu_f32(k0 + 40); + __m256 _k20 = lasx_loadu_f32(k0 + 48); + __m256 _k21 = lasx_loadu_f32(k0 + 56); + __m256 _k22 = lasx_loadu_f32(k0 + 64); + + for (int i = 0; i < output_height; ++i) { + int j = 0; + for (; j + 3 < output_width; j += 4) { + __m256 _sum0 = _bias0; + + __m256 _r00 = lasx_loadu_f32(r0); + __m256 _r01 = lasx_loadu_f32(r0 + 8); + __m256 _r02 = lasx_loadu_f32(r0 + 16); + __m256 _r10 = lasx_loadu_f32(r1); + __m256 _r11 = lasx_loadu_f32(r1 + 8); + __m256 _r12 = lasx_loadu_f32(r1 + 16); + __m256 _r20 = lasx_loadu_f32(r2); + __m256 _r21 = lasx_loadu_f32(r2 + 8); + __m256 _r22 = lasx_loadu_f32(r2 + 16); + + _sum0 = lasx_fmadd_f32(_k00, _r00, _sum0); + _sum0 = lasx_fmadd_f32(_k01, _r01, _sum0); + _sum0 = lasx_fmadd_f32(_k02, _r02, _sum0); + _sum0 = lasx_fmadd_f32(_k10, _r10, _sum0); + _sum0 = lasx_fmadd_f32(_k11, _r11, _sum0); + _sum0 = lasx_fmadd_f32(_k12, _r12, _sum0); + _sum0 = lasx_fmadd_f32(_k20, _r20, _sum0); + _sum0 = lasx_fmadd_f32(_k21, _r21, _sum0); + _sum0 = lasx_fmadd_f32(_k22, _r22, _sum0); + + if (has_act) { + _sum0 = activation8_m256(_sum0, act_type, act_param); + } + lasx_storeu_f32(output_data, _sum0); + + __m256 _sum1 = _bias0; + __m256 _r03 = lasx_loadu_f32(r0 + 24); + __m256 _r13 = lasx_loadu_f32(r1 + 24); + __m256 _r23 = lasx_loadu_f32(r2 + 24); + __m256 _r04 = lasx_loadu_f32(r0 + 32); + __m256 _r14 = lasx_loadu_f32(r1 + 32); + __m256 _r24 = lasx_loadu_f32(r2 + 32); + + _sum1 = lasx_fmadd_f32(_k00, _r02, _sum1); + _sum1 = lasx_fmadd_f32(_k01, _r03, _sum1); + _sum1 = lasx_fmadd_f32(_k02, _r04, _sum1); + _sum1 = lasx_fmadd_f32(_k10, _r12, _sum1); + _sum1 = lasx_fmadd_f32(_k11, _r13, 
_sum1); + _sum1 = lasx_fmadd_f32(_k12, _r14, _sum1); + _sum1 = lasx_fmadd_f32(_k20, _r22, _sum1); + _sum1 = lasx_fmadd_f32(_k21, _r23, _sum1); + _sum1 = lasx_fmadd_f32(_k22, _r24, _sum1); + + if (has_act) { + _sum1 = activation8_m256(_sum1, act_type, act_param); + } + lasx_storeu_f32(output_data + 8, _sum1); + + __m256 _sum2 = _bias0; + __m256 _r05 = lasx_loadu_f32(r0 + 40); + __m256 _r15 = lasx_loadu_f32(r1 + 40); + __m256 _r25 = lasx_loadu_f32(r2 + 40); + __m256 _r06 = lasx_loadu_f32(r0 + 48); + __m256 _r16 = lasx_loadu_f32(r1 + 48); + __m256 _r26 = lasx_loadu_f32(r2 + 48); + + _sum2 = lasx_fmadd_f32(_k00, _r04, _sum2); + _sum2 = lasx_fmadd_f32(_k01, _r05, _sum2); + _sum2 = lasx_fmadd_f32(_k02, _r06, _sum2); + _sum2 = lasx_fmadd_f32(_k10, _r14, _sum2); + _sum2 = lasx_fmadd_f32(_k11, _r15, _sum2); + _sum2 = lasx_fmadd_f32(_k12, _r16, _sum2); + _sum2 = lasx_fmadd_f32(_k20, _r24, _sum2); + _sum2 = lasx_fmadd_f32(_k21, _r25, _sum2); + _sum2 = lasx_fmadd_f32(_k22, _r26, _sum2); + + if (has_act) { + _sum2 = activation8_m256(_sum2, act_type, act_param); + } + lasx_storeu_f32(output_data + 16, _sum2); + + __m256 _sum3 = _bias0; + __m256 _r07 = lasx_loadu_f32(r0 + 56); + __m256 _r17 = lasx_loadu_f32(r1 + 56); + __m256 _r27 = lasx_loadu_f32(r2 + 56); + __m256 _r08 = lasx_loadu_f32(r0 + 64); + __m256 _r18 = lasx_loadu_f32(r1 + 64); + __m256 _r28 = lasx_loadu_f32(r2 + 64); + + _sum3 = lasx_fmadd_f32(_k00, _r06, _sum3); + _sum3 = lasx_fmadd_f32(_k01, _r07, _sum3); + _sum3 = lasx_fmadd_f32(_k02, _r08, _sum3); + _sum3 = lasx_fmadd_f32(_k10, _r16, _sum3); + _sum3 = lasx_fmadd_f32(_k11, _r17, _sum3); + _sum3 = lasx_fmadd_f32(_k12, _r18, _sum3); + _sum3 = lasx_fmadd_f32(_k20, _r26, _sum3); + _sum3 = lasx_fmadd_f32(_k21, _r27, _sum3); + _sum3 = lasx_fmadd_f32(_k22, _r28, _sum3); + + if (has_act) { + _sum3 = activation8_m256(_sum3, act_type, act_param); + } + lasx_storeu_f32(output_data + 24, _sum3); + + r0 += 2 * 32; + r1 += 2 * 32; + r2 += 2 * 32; + output_data += 32; + } + for (; j + 1 < output_width; j += 2) { + __m256 _sum0 = _bias0; + + __m256 _r00 = lasx_loadu_f32(r0); + __m256 _r01 = lasx_loadu_f32(r0 + 8); + __m256 _r02 = lasx_loadu_f32(r0 + 16); + __m256 _r10 = lasx_loadu_f32(r1); + __m256 _r11 = lasx_loadu_f32(r1 + 8); + __m256 _r12 = lasx_loadu_f32(r1 + 16); + __m256 _r20 = lasx_loadu_f32(r2); + __m256 _r21 = lasx_loadu_f32(r2 + 8); + __m256 _r22 = lasx_loadu_f32(r2 + 16); + + _sum0 = lasx_fmadd_f32(_k00, _r00, _sum0); + _sum0 = lasx_fmadd_f32(_k01, _r01, _sum0); + _sum0 = lasx_fmadd_f32(_k02, _r02, _sum0); + _sum0 = lasx_fmadd_f32(_k10, _r10, _sum0); + _sum0 = lasx_fmadd_f32(_k11, _r11, _sum0); + _sum0 = lasx_fmadd_f32(_k12, _r12, _sum0); + _sum0 = lasx_fmadd_f32(_k20, _r20, _sum0); + _sum0 = lasx_fmadd_f32(_k21, _r21, _sum0); + _sum0 = lasx_fmadd_f32(_k22, _r22, _sum0); + + if (has_act) { + _sum0 = activation8_m256(_sum0, act_type, act_param); + } + lasx_storeu_f32(output_data, _sum0); + + __m256 _sum1 = _bias0; + __m256 _r03 = lasx_loadu_f32(r0 + 24); + __m256 _r13 = lasx_loadu_f32(r1 + 24); + __m256 _r23 = lasx_loadu_f32(r2 + 24); + __m256 _r04 = lasx_loadu_f32(r0 + 32); + __m256 _r14 = lasx_loadu_f32(r1 + 32); + __m256 _r24 = lasx_loadu_f32(r2 + 32); + + _sum1 = lasx_fmadd_f32(_k00, _r02, _sum1); + _sum1 = lasx_fmadd_f32(_k01, _r03, _sum1); + _sum1 = lasx_fmadd_f32(_k02, _r04, _sum1); + _sum1 = lasx_fmadd_f32(_k10, _r12, _sum1); + _sum1 = lasx_fmadd_f32(_k11, _r13, _sum1); + _sum1 = lasx_fmadd_f32(_k12, _r14, _sum1); + _sum1 = lasx_fmadd_f32(_k20, _r22, _sum1); + _sum1 = 
lasx_fmadd_f32(_k21, _r23, _sum1); + _sum1 = lasx_fmadd_f32(_k22, _r24, _sum1); + + if (has_act) { + _sum1 = activation8_m256(_sum1, act_type, act_param); + } + lasx_storeu_f32(output_data + 8, _sum1); + + r0 += 2 * 16; + r1 += 2 * 16; + r2 += 2 * 16; + output_data += 16; + } + for (; j < output_width; j++) { + __m256 _sum0 = _bias0; + + __m256 _r00 = lasx_loadu_f32(r0); + __m256 _r01 = lasx_loadu_f32(r0 + 8); + __m256 _r02 = lasx_loadu_f32(r0 + 16); + __m256 _r10 = lasx_loadu_f32(r1); + __m256 _r11 = lasx_loadu_f32(r1 + 8); + __m256 _r12 = lasx_loadu_f32(r1 + 16); + __m256 _r20 = lasx_loadu_f32(r2); + __m256 _r21 = lasx_loadu_f32(r2 + 8); + __m256 _r22 = lasx_loadu_f32(r2 + 16); + + _sum0 = lasx_fmadd_f32(_k00, _r00, _sum0); + _sum0 = lasx_fmadd_f32(_k01, _r01, _sum0); + _sum0 = lasx_fmadd_f32(_k02, _r02, _sum0); + _sum0 = lasx_fmadd_f32(_k10, _r10, _sum0); + _sum0 = lasx_fmadd_f32(_k11, _r11, _sum0); + _sum0 = lasx_fmadd_f32(_k12, _r12, _sum0); + _sum0 = lasx_fmadd_f32(_k20, _r20, _sum0); + _sum0 = lasx_fmadd_f32(_k21, _r21, _sum0); + _sum0 = lasx_fmadd_f32(_k22, _r22, _sum0); + + if (has_act) { + _sum0 = activation8_m256(_sum0, act_type, act_param); + } + lasx_storeu_f32(output_data, _sum0); + + r0 += 2 * 8; + r1 += 2 * 8; + r2 += 2 * 8; + output_data += 8; + } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } // end of for output_height + } // end of for channel_num + } // end of for batch_size +} + +// input [bs, ic/8, ih, iw, 8] +// filter [1, oc/8, kh, kw, 8] +// bias [ oc ] +// output [bs, oc/8, oh, ow, 8] +void conv_depthwise_m256(lite::Tensor* input, + lite::Tensor* output, + lite::Tensor* filter, + lite::Tensor* bias, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const bool has_act, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param) { + // input [bs, ic/8, ih, iw, 8] + CHECK_EQ(input->dims().size(), 5UL); + const int batch_size = input->dims()[0]; + const int channel_num = input->dims()[1]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const float* input_data = input->data(); + + // filter [1, oc/8, kh, kw, 8] + CHECK_EQ(filter->dims().size(), 5UL); + const int kernel_h = filter->dims()[2]; + const int kernel_w = filter->dims()[3]; + const float* filter_data = filter->data(); + + // output [bs, oc/8, oh, ow, 8] + CHECK_EQ(output->dims().size(), 5UL); + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + float* output_data = output->mutable_data(); + + const int input_group_step = input_width * 8 * stride_h; + const int input_channel_step = input_height * input_width * 8; + const int input_batch_step = channel_num * input_height * input_width * 8; + + const int filter_kernel_size = kernel_h * kernel_w; + const int filter_channel_step = kernel_h * kernel_w * 8; + + // kernel offsets + std::vector _space_ofs(filter_kernel_size); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = input_width * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) { + for (int j = 0; j < kernel_w; j++) { + space_ofs[p1++] = p2; + p2 += dilation_w; + } + p2 += gap; + } + } + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + const float* input_ptr = + input_data + bs * input_batch_step + ic * input_channel_step; + const float* filter_ptr = filter_data + ic * filter_channel_step; + for (int i = 0; i < output_height; ++i) { + for (int j = 0; 
j < output_width; ++j) { + __m256 _sum = lasx_set1_f32(0.f); + + if (bias) { + _sum = lasx_loadu_f32((bias->data()) + ic * 8); + } + + const float* start_ptr = + input_ptr + i * input_group_step + j * 8 * stride_w; + + for (int k = 0; k < filter_kernel_size; k++) { + __m256 _input = lasx_loadu_f32(start_ptr + +space_ofs[k] * 8); + __m256 _filter = lasx_loadu_f32(filter_ptr + k * 8); + _sum = lasx_fmadd_f32(_input, _filter, _sum); + } + + if (has_act) { + _sum = activation8_m256(_sum, act_type, act_param); + } + + lasx_storeu_f32(output_data, _sum); + output_data += 8; + } + } + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lasx/conv_depthwise_pack8.h b/lite/backends/loongarch/math/lasx/conv_depthwise_pack8.h new file mode 100644 index 00000000000..8d7d50a361c --- /dev/null +++ b/lite/backends/loongarch/math/lasx/conv_depthwise_pack8.h @@ -0,0 +1,56 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void conv_depthwise_3x3s1_m256(lite::Tensor* input, + lite::Tensor* output, + lite::Tensor* filter, + lite::Tensor* bias, + const bool has_act, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param); + +void conv_depthwise_3x3s2_m256(lite::Tensor* input, + lite::Tensor* output, + lite::Tensor* filter, + lite::Tensor* bias, + const bool has_act, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param); + +void conv_depthwise_m256(lite::Tensor* input, + lite::Tensor* output, + lite::Tensor* filter, + lite::Tensor* bias, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const bool has_act, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lasx/lasx_mathfuns.cc b/lite/backends/loongarch/math/lasx/lasx_mathfuns.cc new file mode 100644 index 00000000000..dfb92ad71ae --- /dev/null +++ b/lite/backends/loongarch/math/lasx/lasx_mathfuns.cc @@ -0,0 +1,527 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
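The depthwise kernels above, and the declarations in conv_depthwise_pack8.h, all assume the pack-8 layout spelled out in their comments: input [bs, ic/8, ih, iw, 8], so each innermost group of 8 floats holds 8 consecutive channels of one pixel. A minimal index-mapping sketch in plain C++ (the helper name is illustrative, not part of the patch), consistent with the strides the kernels compute (input_group_step = iw * 8, input_channel_step = ih * iw * 8, input_batch_step = (ic/8) * ih * iw * 8):

#include <cstddef>

// Offset of logical NCHW element (n, c, h, w) inside a packed
// [bs, c/8, h, w, 8] buffer; only the index arithmetic matters here.
inline size_t pack8_offset(size_t n, size_t c, size_t h, size_t w,
                           size_t C, size_t H, size_t W) {
  const size_t group = c / 8;  // which 8-channel group
  const size_t lane = c % 8;   // position inside the group
  return (((n * (C / 8) + group) * H + h) * W + w) * 8 + lane;
}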
+/* + lasx implementation of sin, cos, sincos, exp and log + + Based on "lite/backends/x86" + + Copyright (C) 2012 Giovanni Garberoglio + Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ +#include "lite/backends/loongarch/math/include/mathfuns.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* declare some LASX constants -- why can't I figure a better way to do that? */ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} + +_PS256_CONST(1, 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf *)_ps256_1; // NOLINT + + v8sf invalid_mask = lasx_xvfcmp_sle_s(x, lasx_setzero_f32()); + + x = lasx_max_f32(x, *(v8sf *)_ps256_min_norm_pos); // NOLINT + /* cut off denormalized stuff */ // NOLINT + + // can be done with LASX + imm0 = lasx_srli_i32(lasx_castf32_m256i(x), 23); + + /* keep only the fractional part */ + x = lasx_and_f32(x, *(v8sf *)_ps256_inv_mant_mask); // NOLINT + x = lasx_or_f32(x, *(v8sf *)_ps256_0p5); // NOLINT + + // this is again another LASX instruction + 
imm0 = lasx_sub_i32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT + v8sf e = lasx_cvti32_f32(imm0); + + e = lasx_add_f32(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + v8sf mask = + lasx_xvfcmp_slt_s(x, *(v8sf *)_ps256_cephes_SQRTHF); // NOLINT + v8sf tmp = lasx_and_f32(x, mask); + x = lasx_sub_f32(x, one); + e = lasx_sub_f32(e, lasx_and_f32(one, mask)); + x = lasx_add_f32(x, tmp); + + v8sf z = lasx_mul_f32(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_log_p0; // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_log_p1); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_log_p2); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_log_p3); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_log_p4); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_log_p5); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_log_p6); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_log_p7); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_log_p8); // NOLINT + y = lasx_mul_f32(y, x); + + y = lasx_mul_f32(y, z); + + tmp = lasx_mul_f32(e, *(v8sf *)_ps256_cephes_log_q1); // NOLINT + y = lasx_add_f32(y, tmp); + + tmp = lasx_mul_f32(z, *(v8sf *)_ps256_0p5); // NOLINT + y = lasx_sub_f32(y, tmp); + + tmp = lasx_mul_f32(e, *(v8sf *)_ps256_cephes_log_q2); // NOLINT + x = lasx_add_f32(x, y); + x = lasx_add_f32(x, tmp); + x = lasx_or_f32(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +v8sf exp256_ps(v8sf x) { + v8sf tmp = lasx_setzero_f32(), fx; + v8si imm0; + v8sf one = *(v8sf *)_ps256_1; // NOLINT + + x = lasx_min_f32(x, *(v8sf *)_ps256_exp_hi); // NOLINT + x = lasx_max_f32(x, *(v8sf *)_ps256_exp_lo); // NOLINT + + /* express exp(x) as exp(g + n*log(2)) */ + fx = lasx_mul_f32(x, *(v8sf *)_ps256_cephes_LOG2EF); // NOLINT + fx = lasx_add_f32(fx, *(v8sf *)_ps256_0p5); // NOLINT + + // imm0 = lasx_cvttf32_i32(fx); + // tmp = lasx_cvti32_f32(imm0); + + tmp = lasx_floor_f32(fx); + + /* if greater, substract 1 */ + v8sf mask = lasx_xvfcmp_slt_s(fx, tmp); + mask = lasx_and_f32(mask, one); + fx = lasx_sub_f32(tmp, mask); + + tmp = lasx_mul_f32(fx, *(v8sf *)_ps256_cephes_exp_C1); // NOLINT + v8sf z = lasx_mul_f32(fx, *(v8sf *)_ps256_cephes_exp_C2); // NOLINT + x = lasx_sub_f32(x, tmp); + x = lasx_sub_f32(x, z); + + z = lasx_mul_f32(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_exp_p1); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_exp_p2); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_exp_p3); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf *)_ps256_cephes_exp_p4); // NOLINT + y = lasx_mul_f32(y, x); + y = lasx_add_f32(y, *(v8sf 
*)_ps256_cephes_exp_p5); // NOLINT + y = lasx_mul_f32(y, z); + y = lasx_add_f32(y, x); + y = lasx_add_f32(y, one); + + /* build 2^n */ + imm0 = lasx_cvttf32_i32(fx); + // another two LASX instructions + imm0 = lasx_add_i32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT + imm0 = lasx_slli_i32(imm0, 23); + v8sf pow2n = lasx_castm256i_f32(imm0); + y = lasx_mul_f32(y, pow2n); + return y; +} + +v8sf pow256_ps(v8sf a, v8sf b) { + // pow(x, m) = exp(m * log(x)) + v8sf vsum = exp256_ps(lasx_mul_f32(b, log256_ps(a))); + return vsum; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + +/* evaluation of 8 sines at onces using LASX intrisics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + +*/ +v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = lasx_setzero_f32(), xmm3, sign_bit, y; + v8si imm0, imm2; + + sign_bit = x; + /* take the absolute value */ + x = lasx_and_f32(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT + /* extract the sign bit (upper one) */ + sign_bit = lasx_and_f32(sign_bit, *(v8sf *)_ps256_sign_mask); // NOLINT + + /* scale by 4/Pi */ + y = lasx_mul_f32(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT + + /* store the integer part of y in mm0 */ + imm2 = lasx_cvttf32_i32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two LASX instruction + imm2 = lasx_add_i32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = lasx_and_m256i(imm2, *(v8si *)_pi32_256_inv1); // NOLINT + y = lasx_cvti32_f32(imm2); + + /* get the swap sign flag */ + imm0 = lasx_and_m256i(imm2, *(v8si *)_pi32_256_4); // NOLINT + imm0 = lasx_slli_i32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4(ptr)) +#define lasx_storeu_epi8(ptr) \ + lasx_storeu_m256i(reinterpret_cast<__m256i const *>(ptr)) + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lstm_compute.cc b/lite/backends/loongarch/math/lstm_compute.cc new file mode 100644 index 00000000000..4aab29a1a25 --- /dev/null +++ b/lite/backends/loongarch/math/lstm_compute.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
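The helpers defined above in lasx_mathfuns.cc (log256_ps, exp256_ps, pow256_ps, sin256_ps) each map 8 packed floats to 8 packed floats. A hedged usage sketch for applying one of them elementwise, assuming a buffer length that is a multiple of 8 and the v8sf / lasx_loadu_f32 / lasx_storeu_f32 wrappers from this backend's headers; the function name apply_exp8 is hypothetical:

// Hypothetical elementwise wrapper around exp256_ps (defined above).
static void apply_exp8(const float* x, float* y, int n) {
  for (int i = 0; i < n; i += 8) {    // n is assumed to be a multiple of 8
    v8sf vx = lasx_loadu_f32(x + i);  // load 8 lanes
    v8sf vy = exp256_ps(vx);          // 8-wide exp, cephes-style polynomial
    lasx_storeu_f32(y + i, vy);       // store 8 results
  }
}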
*/ + +#include "lite/backends/loongarch/math/lstm_compute.h" +#include "lite/backends/loongarch/math/lstm_cpu_kernel.h" +#include "lite/backends/loongarch/math/lstm_kernel.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const lite::LoongArchContext& context, + LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_forward(detail::forward::lstm(), + value, + frame_size, + cell_clip, + cand_act, + gate_act, + cell_act); + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const lite::LoongArchContext& context, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_backward(detail::backward::lstm(), + value, + grad, + frame_size, + cell_clip, + cand_act, + gate_act, + cell_act); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + + grad.gate_grad += frame_size * 4; + grad.state_grad += frame_size; + grad.state_active_grad += frame_size; + grad.output_grad += frame_size; + if (grad.prev_state_grad) { + grad.prev_state_grad += frame_size; + } + } + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lstm_compute.h b/lite/backends/loongarch/math/lstm_compute.h new file mode 100644 index 00000000000..827e512abac --- /dev/null +++ b/lite/backends/loongarch/math/lstm_compute.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
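In LstmUnitFunctor::compute above, the batch loop advances the raw pointers inside LstmMetaValue: gate_value steps by 4 * frame_size because each example stores its four gate pre-activations contiguously, while state_value, state_active_value and output_value step by frame_size (and prev_state_value only when it is non-null). A small sketch of that gate layout; the helper name is illustrative:

// Per example, gate_value is four frame_size-wide blocks laid out as
//   [ input | input gate | forget gate | output gate ]
// matching value_in / value_ig / value_fg / value_og in the kernels above.
inline const float* gate_block(const float* gate_value, int frame_size,
                               int which /* 0 = in, 1 = ig, 2 = fg, 3 = og */) {
  return gate_value + which * frame_size;
}
// Moving to the next batch row therefore skips 4 * frame_size floats.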
*/ + +#pragma once + +#include "lite/backends/loongarch/math/activation_functions.h" +#include "lite/core/context.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct LstmMetaValue { + T *gate_value; + T *prev_state_value; + T *state_value; + T *state_active_value; + T *output_value; + T *check_ig; + T *check_fg; + T *check_og; +}; + +template +struct LstmMetaGrad { + T *gate_grad; + T *prev_state_grad; + T *state_grad; + T *state_active_grad; + T *output_grad; + T *check_ig_grad; + T *check_fg_grad; + T *check_og_grad; +}; + +template +class LstmUnitFunctor { + public: + static void compute(const lite::Context &context, + LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); +}; + +template +class LstmUnitGradFunctor { + public: + static void compute(const lite::Context &context, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lstm_cpu_kernel.h b/lite/backends/loongarch/math/lstm_cpu_kernel.h new file mode 100644 index 00000000000..552eca059f2 --- /dev/null +++ b/lite/backends/loongarch/math/lstm_cpu_kernel.h @@ -0,0 +1,421 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/loongarch/math/activation_functions.h" +#include "lite/backends/loongarch/math/lstm_compute.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +namespace detail { + +template +void naive_lstm_forward_one_sequence(Op op, + LstmMetaValue value, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_checkI; + T r_checkF; + T r_checkO; + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? 
value.check_og[i] : 0; + + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; + } + + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_prev_state, + &r_state, + &r_state_atv, + &r_out, + &r_checkI, + &r_checkF, + &r_checkO, + &cell_clip, + active_node, + active_gate, + active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + value.state_value[i] = r_state; + value.state_active_value[i] = r_state_atv; + value.output_value[i] = r_out; + } +} + +template +void naive_lstm_backward_one_sequence(Op op, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI; + T r_checkF; + T r_checkO; + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + T *grad_in = grad.gate_grad; + T *grad_ig = grad.gate_grad + frame_size; + T *grad_fg = grad.gate_grad + frame_size * 2; + T *grad_og = grad.gate_grad + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? value.check_og[i] : 0; + r_state = value.state_value[i]; + r_state_atv = value.state_active_value[i]; + r_output_grad = grad.output_grad[i]; + r_state_grad = grad.state_grad[i]; + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; + } + + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_grad_in, + &r_grad_ig, + &r_grad_fg, + &r_grad_og, + &r_prev_state, + &r_prev_state_grad, + &r_state, + &r_state_grad, + &r_state_atv, + &r_output_grad, + &r_checkI, + &r_checkF, + &r_checkO, + &r_checkIGrad, + &r_checkFGrad, + &r_checkOGrad, + &cell_clip, + active_node, + active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + grad.state_grad[i] = r_state_grad; + + if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad; + } + if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad; + } +} + +template +void lasx_lstm_forward_one_sequence(Op op, + LstmMetaValue value, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { +#ifdef __loongarch_asx + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_checkI = lasx_set1_f32(0.0f); + __m256 r_checkF = lasx_set1_f32(0.0f); + __m256 r_checkO = lasx_set1_f32(0.0f); + __m256 r_state; + __m256 r_prev_state = lasx_set1_f32(0.0f); + __m256 r_state_atv; + __m256 r_out; + + __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value); + __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size); + __m256 
*value_fg = + reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2); + __m256 *value_og = + reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i]; + r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i]; + r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i]; + } + + if (value.prev_state_value) { + r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i]; + } + + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_prev_state, + &r_state, + &r_state_atv, + &r_out, + &r_checkI, + &r_checkF, + &r_checkO, + &cell_clip, + active_node, + active_gate, + active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state; + (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv; + (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out; + } +#endif +} + +template +void lasx_lstm_backward_one_sequence(Op op, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { +#ifdef __loongarch_asx + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_grad_in; + __m256 r_grad_ig; + __m256 r_grad_fg; + __m256 r_grad_og; + __m256 r_prev_state = lasx_set1_f32(0.0f); + __m256 r_prev_state_grad; + __m256 r_state_grad; + __m256 r_state; + __m256 r_state_atv; + __m256 r_output_grad; + __m256 r_checkI = lasx_set1_f32(0.0f); + __m256 r_checkF = lasx_set1_f32(0.0f); + __m256 r_checkO = lasx_set1_f32(0.0f); + __m256 r_checkIGrad; + __m256 r_checkFGrad; + __m256 r_checkOGrad; + + __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value); + __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size); + __m256 *value_fg = + reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2); + __m256 *value_og = + reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3); + __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad); + __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size); + __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2); + __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i]; + r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i]; + r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i]; + } + r_state = (reinterpret_cast<__m256 *>(value.state_value))[i]; + r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i]; + r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i]; + r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i]; + if (value.prev_state_value) { + r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i]; + } + + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_grad_in, + &r_grad_ig, + &r_grad_fg, + &r_grad_og, + &r_prev_state, + &r_prev_state_grad, + &r_state, + &r_state_grad, + &r_state_atv, + 
&r_output_grad, + &r_checkI, + &r_checkF, + &r_checkO, + &r_checkIGrad, + &r_checkFGrad, + &r_checkOGrad, + &cell_clip, + active_node, + active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad; + + if (grad.prev_state_grad) + (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) + (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad; + if (grad.check_fg_grad) + (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad; + } + if (grad.check_og_grad) + (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad; + } +#endif +} + +template +void cpu_lstm_forward(Op op, + LstmMetaValue value, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + if (Op::lasx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + lasx_lstm_forward_one_sequence(op, + value, + frame_size, + cell_clip, + active_node, + active_gate, + active_state); + } else { + naive_lstm_forward_one_sequence(op, + value, + frame_size, + cell_clip, + active_node, + active_gate, + active_state); + } +} + +template +void cpu_lstm_backward(Op op, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + if (Op::lasx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + lasx_lstm_backward_one_sequence(op, + value, + grad, + frame_size, + cell_clip, + active_node, + active_gate, + active_state); + } else { + naive_lstm_backward_one_sequence(op, + value, + grad, + frame_size, + cell_clip, + active_node, + active_gate, + active_state); + } +} + +} // namespace detail +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lstm_kernel.h b/lite/backends/loongarch/math/lstm_kernel.h new file mode 100644 index 00000000000..59ef9f07d6d --- /dev/null +++ b/lite/backends/loongarch/math/lstm_kernel.h @@ -0,0 +1,231 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
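cpu_lstm_forward and cpu_lstm_backward above take the vector path only when the operator provides an LASX overload (Op::lasx), frame_size is a multiple of 8, and the element type is float (the std::is_same check compares against float, as in the x86 backend this code is ported from); otherwise they fall back to the naive scalar kernels. A small sketch of that gate, mostly to spell out the bit trick:

// !(frame_size & (8 - 1)) is the branch-free form of frame_size % 8 == 0;
// only then can the LASX path run frame_size / 8 iterations of 8 floats.
inline bool can_use_lasx_path(int frame_size, bool op_has_lasx, bool is_float) {
  const bool multiple_of_8 = (frame_size & 7) == 0;
  return op_has_lasx && multiple_of_8 && is_float;
}
// e.g. can_use_lasx_path(64, true, true) -> true
//      can_use_lasx_path(60, true, true) -> false (60 % 8 != 0)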
*/ + +#pragma once +#include +#include "lite/backends/loongarch/math/activation_functions.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +namespace detail { + +namespace forward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T *value_in, + T *value_ig, + T *value_fg, + T *value_og, + T *prev_state, + T *state, + T *state_atv, + T *output, + T *checkI, + T *checkF, + T *checkO, + T *cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + *value_in = activation(*value_in, active_node); + *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate); + *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate); + *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg); + + if (*cell_clip > 0.0) { + if (*state < -1.0 * (*cell_clip)) { + *state = -1.0 * (*cell_clip); + } + if (*state > *cell_clip) { + *state = *cell_clip; + } + } + *value_og = activation(*value_og + (*state) * (*checkO), active_gate); + *state_atv = activation(*state, active_state); + *output = (*value_og) * (*state_atv); + } +#ifndef __loongarch_asx + static const bool lasx = false; +#else + static const bool lasx = std::is_same::value; + + HOSTDEVICE void operator()(__m256 *value_in, + __m256 *value_ig, + __m256 *value_fg, + __m256 *value_og, + __m256 *prev_state, + __m256 *state, + __m256 *state_atv, + __m256 *output, + __m256 *checkI, + __m256 *checkF, + __m256 *checkO, + T *cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + *value_in = activation(*value_in, active_node); + *value_ig = activation( + lasx_add_f32(*value_ig, lasx_mul_f32(*prev_state, *checkI)), + active_gate); + *value_fg = activation( + lasx_add_f32(*value_fg, lasx_mul_f32(*prev_state, *checkF)), + active_gate); + *state = lasx_add_f32(lasx_mul_f32(*value_in, *value_ig), + lasx_mul_f32(*prev_state, *value_fg)); + + if (*cell_clip > 0.0f) { + __m256 min = lasx_set1_f32(0.0f - *cell_clip); + __m256 max = lasx_set1_f32(*cell_clip); + *state = lasx_min_f32(max, *state); + *state = lasx_max_f32(min, *state); + } + *value_og = activation( + lasx_add_f32(*value_og, lasx_mul_f32(*state, *checkO)), active_gate); + *state_atv = activation(*state, active_state); + *output = lasx_mul_f32(*value_og, *state_atv); + } +#endif +}; + +} // namespace forward + +namespace backward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T *value_in, + T *value_ig, + T *value_fg, + T *value_og, + T *grad_in, + T *grad_ig, + T *grad_fg, + T *grad_og, + T *prev_state, + T *prev_state_grad, + T *state, + T *state_grad, + T *state_atv, + T *output_grad, + T *checkI, + T *checkF, + T *checkO, + T *checkIGrad, + T *checkFGrad, + T *checkOGrad, + T *cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + *grad_og = + activation((*output_grad) * (*state_atv), *value_og, active_gate); + if (*cell_clip > 0.0f) { + if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) { + *state_grad = 0.0f; + } else { + *state_grad += + activation((*output_grad) * (*value_og), *state_atv, active_state) + + (*grad_og) * (*checkO); + } + } else { + *state_grad += + activation((*output_grad) * (*value_og), *state_atv, active_state) + + (*grad_og) * (*checkO); + } + + *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node); + *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate); + *grad_fg = + 
activation((*state_grad) * (*prev_state), *value_fg, active_gate); + *prev_state_grad = (*grad_ig) * (*checkI) + (*grad_fg) * (*checkF) + + (*state_grad) * (*value_fg); + *checkIGrad = (*grad_ig) * (*prev_state); + *checkFGrad = (*grad_fg) * (*prev_state); + *checkOGrad = (*grad_og) * (*state); + } + +#ifndef __loongarch_asx + static const bool lasx = false; +#else + static const bool lasx = std::is_same::value; + HOSTDEVICE void operator()(__m256 *value_in, + __m256 *value_ig, + __m256 *value_fg, + __m256 *value_og, + __m256 *grad_in, + __m256 *grad_ig, + __m256 *grad_fg, + __m256 *grad_og, + __m256 *prev_state, + __m256 *prev_state_grad, + __m256 *state, + __m256 *state_grad, + __m256 *state_atv, + __m256 *output_grad, + __m256 *checkI, + __m256 *checkF, + __m256 *checkO, + __m256 *checkIGrad, + __m256 *checkFGrad, + __m256 *checkOGrad, + T *cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + *grad_og = activation( + lasx_mul_f32(*output_grad, *state_atv), *value_og, active_gate); + if (*cell_clip > 0.0f) { + T *state_ = reinterpret_cast(state); + if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) { + *state_grad = lasx_set1_f32(0.0f); + } else { + *state_grad = + lasx_add_f32(activation(lasx_mul_f32(*output_grad, *value_og), + *state_atv, + active_state), + *state_grad); + *state_grad = + lasx_add_f32(lasx_mul_f32(*grad_og, *checkO), *state_grad); + } + } + *grad_in = activation( + lasx_mul_f32(*state_grad, *value_ig), *value_in, active_node); + *grad_ig = activation( + lasx_mul_f32(*state_grad, *value_in), *value_ig, active_gate); + *grad_fg = activation( + lasx_mul_f32(*state_grad, *prev_state), *value_fg, active_gate); + *prev_state_grad = lasx_add_f32(lasx_mul_f32(*grad_ig, *checkI), + lasx_mul_f32(*grad_fg, *checkF)); + *prev_state_grad = + lasx_add_f32(lasx_mul_f32(*state_grad, *value_fg), *prev_state_grad); + *checkIGrad = lasx_mul_f32(*grad_ig, *prev_state); + *checkFGrad = lasx_mul_f32(*grad_fg, *prev_state); + *checkOGrad = lasx_mul_f32(*grad_og, *state); + } +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lsx/conv_depthwise_pack4.cc b/lite/backends/loongarch/math/lsx/conv_depthwise_pack4.cc new file mode 100644 index 00000000000..3e9399aba01 --- /dev/null +++ b/lite/backends/loongarch/math/lsx/conv_depthwise_pack4.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
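Both general depthwise kernels, the LASX conv_depthwise_m256 above and the LSX conv_depthwise_m128 in the file that follows, precompute space_ofs: a table of per-tap offsets (in packed pixel groups) that folds the dilation into one flat index, which each kernel then scales by its lane width (8 or 4). A scalar sketch of the same recurrence, with a worked example:

#include <vector>

// Offset of kernel tap (i, j) relative to the window's top-left pixel for an
// image of width input_width, i.e. i * dilation_h * input_width + j * dilation_w.
std::vector<int> make_space_ofs(int kh, int kw, int input_width,
                                int dilation_h, int dilation_w) {
  std::vector<int> ofs(kh * kw);
  const int gap = input_width * dilation_h - kw * dilation_w;
  int p = 0, cur = 0;
  for (int i = 0; i < kh; ++i) {
    for (int j = 0; j < kw; ++j) {
      ofs[p++] = cur;
      cur += dilation_w;
    }
    cur += gap;
  }
  return ofs;
}
// For kh = kw = 3, dilation 1, input_width = 10:
//   {0, 1, 2, 10, 11, 12, 20, 21, 22}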
*/ + +#include "lite/backends/loongarch/math/common/conv_utils.h" +#include "lite/backends/loongarch/math/lsx/conv_depthwise_pack4.h" +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void conv_depthwise_m128(lite::Tensor* input, + lite::Tensor* output, + lite::Tensor* filter, + lite::Tensor* bias, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const bool has_act, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param) { + // input [bs, ic/8, ih, iw, 8] + CHECK_EQ(input->dims().size(), 5UL); + const int batch_size = input->dims()[0]; + const int channel_num = input->dims()[1]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const float* input_data = input->data(); + + // filter [1, oc/8, kh, kw, 8] + CHECK_EQ(filter->dims().size(), 5UL); + const int kernel_h = filter->dims()[2]; + const int kernel_w = filter->dims()[3]; + const float* filter_data = filter->data(); + + // output [bs, oc/8, oh, ow, 8] + CHECK_EQ(output->dims().size(), 5UL); + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + float* output_data = output->mutable_data(); + + const int input_group_step = input_width * 4; + const int input_channel_step = input_height * input_width * 4; + const int input_batch_step = channel_num * input_height * input_width * 4; + + const int filter_kernel_size = kernel_h * kernel_w; + const int filter_channel_step = kernel_h * kernel_w * 4; + + // kernel offsets + std::vector _space_ofs(filter_kernel_size); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = input_width * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) { + for (int j = 0; j < kernel_w; j++) { + space_ofs[p1++] = p2; + p2 += dilation_w; + } + p2 += gap; + } + } + + for (int bs = 0; bs < batch_size; ++bs) { + for (int ic = 0; ic < channel_num; ++ic) { + const float* input_ptr = + input_data + bs * input_batch_step + ic * input_channel_step; + const float* filter_ptr = filter_data + ic * filter_channel_step; + for (int i = 0; i < output_height; ++i) { + for (int j = 0; j < output_width; ++j) { + __m128 _sum = lsx_set1_f32(0.f); + + if (bias) { + _sum = lsx_loadu_f32((bias->data()) + ic * 4); + } + + const float* start_ptr = + input_ptr + i * stride_h * input_group_step + j * stride_w * 4; + + for (int k = 0; k < filter_kernel_size; k++) { + __m128 _input = lsx_loadu_f32(start_ptr + space_ofs[k] * 4); + __m128 _filter = lsx_loadu_f32(filter_ptr + k * 4); + __m128 _mul = lsx_mul_f32(_input, _filter); + _sum = lsx_add_f32(_mul, _sum); + } + + if (has_act) { + _sum = activation4_m128(_sum, act_type, act_param); + } + + lsx_storeu_f32(output_data, _sum); + output_data += 4; + } + } + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lsx/conv_depthwise_pack4.h b/lite/backends/loongarch/math/lsx/conv_depthwise_pack4.h new file mode 100644 index 00000000000..8e992453a25 --- /dev/null +++ b/lite/backends/loongarch/math/lsx/conv_depthwise_pack4.h @@ -0,0 +1,40 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void conv_depthwise_m128(lite::Tensor* input, + lite::Tensor* output, + lite::Tensor* filter, + lite::Tensor* bias, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const bool has_act, + const lite_api::ActivationType act_type, + const operators::ActivationParam act_param); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/lsx/lsx_mathfuns.h b/lite/backends/loongarch/math/lsx/lsx_mathfuns.h new file mode 100644 index 00000000000..97dde8178b7 --- /dev/null +++ b/lite/backends/loongarch/math/lsx/lsx_mathfuns.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/loongarch/xxl.h" +#include "lite/backends/loongarch/cpu_info.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +/* __m128 is ugly to write */ +typedef __m128i v4si; // vector of 8 int + +#define lsx_loadu_epi8(ptr) \ + lsx_loadu_m128i(reinterpret_cast<__m128i const *>(ptr)) + +#define lsx_storeu_epi8(ptr) \ + lsx_storeu_m128i(reinterpret_cast<__m128i const *>(ptr)) + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/math_function.cc b/lite/backends/loongarch/math/math_function.cc new file mode 100644 index 00000000000..fa1454a8ff8 --- /dev/null +++ b/lite/backends/loongarch/math/math_function.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/loongarch/math/math_function.h" + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +#include +#include "lite/backends/loongarch/fluid/data_type.h" +#include "lite/backends/loongarch/fluid/float16.h" +#include "lite/backends/loongarch/math/math_function_impl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(lite::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void apply() const { + auto* begin = tensor_->template mutable_data(lite::TargetType::kLoongArch); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + lite::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const lite::Context& context, + lite::Tensor* tensor, + float value) { + // lite::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); + TensorSetConstantCPU(tensor, value).apply(); +} + +template +struct TensorSetConstantWithTarget /*: public boost::static_visitor*/ { + TensorSetConstantWithTarget(const lite::Context& context, + lite::Tensor* tensor, + float value) + : context_(context), tensor_(tensor), value_(value) {} + + void operator()() const { + set_constant_with_place(context_, tensor_, value_); + } + + const lite::Context& context_; + lite::Tensor* tensor_; + float value_; +}; + +template +void set_constant(const lite::Context& context, + lite::Tensor* tensor, + float value) { + TensorSetConstantWithTarget func(context, tensor, value); + func(); +} + +template +struct RowwiseAdd { + void operator()(const lite::Context& context, + const lite::Tensor& input, + const lite::Tensor& vector, + lite::Tensor* output) { + const auto& in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + CHECK_EQ(vector.numel(), size); + CHECK_EQ(output->dims(), in_dims); + + const T* input_data = input.data(); + const T* vector_data = vector.data(); + T* output_data = output->template mutable_data(); + for (int64_t i = 0; i < in_dims[0]; ++i) { + for (int64_t j = 0; j < size; ++j) { + output_data[i * size + j] = input_data[i * size + j] + vector_data[j]; + } + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/math_function.h b/lite/backends/loongarch/math/math_function.h new file mode 100644 index 00000000000..f317f641bb7 --- /dev/null +++ b/lite/backends/loongarch/math/math_function.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "lite/backends/loongarch/fluid/float16.h" +#include "lite/core/context.h" +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "lite/utils/log/cp_logging.h" +// #include "lite/tensor_util.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +// template +// struct Transpose { +// void operator()(const lite::Context &context) +// }; + +template +struct Transpose { + void operator()(const lite::Context& context, + const lite::Tensor& in, + lite::Tensor* out, + const std::vector& axis); +}; + +template +struct SetConstant { + void operator()(const lite::Context& context, + lite::Tensor* tensor, + T num); +}; + +template +void set_constant_with_place(const lite::Context& context, + lite::Tensor* tensor, + float value); + +template +void set_constant(const lite::Context& context, + lite::Tensor* tensor, + float value); + +template +struct RowwiseAdd { + void operator()(const lite::Context& context, + const lite::Tensor& input, + const lite::Tensor& vec, + lite::Tensor* output); +}; + +template +struct ColwiseSum { + void operator()(const lite::Context& context, + const lite::Tensor& input, + lite::Tensor* vec); +}; + +template +struct RowwiseSum { + void operator()(const lite::Context& context, + const lite::Tensor& input, + lite::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const lite::Context& context, + const lite::Tensor& input, + lite::Tensor* vec); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/math_function_impl.h b/lite/backends/loongarch/math/math_function_impl.h new file mode 100644 index 00000000000..1209feee0f8 --- /dev/null +++ b/lite/backends/loongarch/math/math_function_impl.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
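The reduction functors declared above in math_function.h follow a simple shape contract for a row-major [height, size] input: ColwiseSum yields a length-size vector, while RowwiseSum and RowwiseMean yield length-height vectors, and the implementations in math_function_impl.h below CHECK_EQ exactly those sizes. A scalar reference for ColwiseSum (the name colwise_sum_ref is illustrative), matching the hand-rolled CPU specialization that follows:

#include <cstddef>
#include <vector>

// Column sums of a row-major height x size matrix.
std::vector<float> colwise_sum_ref(const std::vector<float>& in,
                                   size_t height, size_t size) {
  std::vector<float> out(size, 0.f);
  for (size_t i = 0; i < height; ++i)
    for (size_t j = 0; j < size; ++j) out[j] += in[i * size + j];
  return out;
}
// e.g. colwise_sum_ref({1, 2, 3, 4, 5, 6}, 2, 3) == {5, 7, 9}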
*/ + +#pragma once +#include +#include "lite/backends/loongarch/fluid/data_type.h" +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/math/math_function.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +void SetConstant::operator()(const lite::Context& context, + lite::Tensor* tensor, + T num) { + auto t = lite::fluid::EigenVector::Flatten(*tensor); + + // t.device(*Eigen::DefaultDevice()) = t.constant(static_cast(num)); + // t.device(*context.eigen_device()) = t.constant(static_cast(num)); + t.device(typename lite::fluid::EigenDevice::Type()) = + t.constant(static_cast(num)); +} + +template +void Transpose::operator()( + const lite::Context& context, + const lite::TensorLite& in, + lite::TensorLite* out, + const std::vector& axis) { + Eigen::array permute; + for (int i = 0; i < Rank; i++) { + permute[i] = axis[i]; + } + auto eigen_in = lite::fluid::EigenTensor::From(in); + auto eigen_out = lite::fluid::EigenTensor::From(*out); + // auto* dev = context.eigen_device(); + // eigen_out.device(*dev) = eigen_in.shuffle(permute); + eigen_out.device(typename lite::fluid::EigenDevice::Type()) = + eigen_in.shuffle(permute); +} + +template +void ColwiseSum::operator()(const lite::Context& context, + const lite::TensorLite& input, + lite::TensorLite* out) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + CHECK_EQ(out->numel(), size); + + auto in = lite::fluid::EigenMatrix::From(input); + auto vec = lite::fluid::EigenVector::Flatten(*out); + + // vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); + vec.device(typename lite::fluid::EigenDevice::Type()) = + in.sum(Eigen::array({{0}})); +} + +// Specialize for CPU, since Eigen implement a general reduce. However, +// colwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class ColwiseSum { + public: + void operator()(const lite::LoongArchContext& context, + const lite::TensorLite& input, + lite::TensorLite* out) { + auto& in_dims = input.dims(); + auto height = in_dims[0]; + auto size = in_dims[1]; + CHECK_EQ(out->numel(), size); + + T* out_buf = out->template mutable_data(out->target()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + for (size_t j = 0; j < static_cast(size); ++j) { + if (i == 0) { + out_buf[j] = in_buf[i * size + j]; + } else { + out_buf[j] += in_buf[i * size + j]; + } + } + } + } +}; + +template +void RowwiseMean::operator()(const lite::Context& context, + const lite::TensorLite& input, + lite::TensorLite* out) { + auto in_dims = input.dims(); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); + + auto in = lite::fluid::EigenMatrix::From(input); + auto vec = lite::fluid::EigenVector::Flatten(*out); + + // vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); + vec.device(typename lite::fluid::EigenDevice::Type()) = + in.mean(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. 
General reduce has a huge overhead in +// CPU +template +class RowwiseMean { + public: + void operator()(const lite::LoongArchContext& context, + const lite::TensorLite& input, + lite::TensorLite* out) { + auto& in_dims = input.dims(); + CHECK_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + CHECK_EQ(out->numel(), height); + auto inv_size = 1.0 / size; + T* out_buf = out->template mutable_data(out->target()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum * inv_size; + } + } +}; + +template +void RowwiseSum::operator()(const lite::Context& context, + const lite::TensorLite& input, + lite::TensorLite* out) { + auto in_dims = input.dims(); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); + + auto in = lite::fluid::EigenMatrix::From(input); + auto vec = lite::fluid::EigenVector::Flatten(*out); + + // vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); + vec.device(typename lite::fluid::EigenDevice::Type()) = + in.sum(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseSum { + public: + void operator()(const lite::LoongArchContext& context, + const lite::TensorLite& input, + lite::TensorLite* out) { + auto& in_dims = input.dims(); + CHECK_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + CHECK_EQ(out->numel(), height); + + T* out_buf = out->template mutable_data(out->target()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum; + } + } +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/maxouting.cc b/lite/backends/loongarch/math/maxouting.cc new file mode 100644 index 00000000000..6d669ed9597 --- /dev/null +++ b/lite/backends/loongarch/math/maxouting.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
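The CPU specializations above replace Eigen's general reduction with straight loops precisely because the generic reduce carries noticeable overhead for these simple row/column patterns. A minimal standalone illustration of the rowwise-mean loop follows (plain row-major buffers are assumed in place of lite::TensorLite; the helper name is made up for the sketch).

#include <cassert>
#include <vector>

// Rowwise mean over a row-major [height x width] buffer, matching the
// specialized RowwiseMean loop: out[i] = (1 / width) * sum_j in[i][j].
void rowwise_mean_ref(const float* in, float* out, int height, int width) {
  const float inv_size = 1.0f / width;
  for (int i = 0; i < height; ++i) {
    float sum = 0.f;
    for (int j = 0; j < width; ++j) sum += in[i * width + j];
    out[i] = sum * inv_size;
  }
}

int main() {
  std::vector<float> in = {1, 3, 5, 7};  // 2 x 2
  std::vector<float> out(2);
  rowwise_mean_ref(in.data(), out.data(), 2, 2);
  assert(out[0] == 2.f && out[1] == 6.f);
  return 0;
}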
*/ + +#include "lite/backends/loongarch/math/maxouting.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +// All tensors are in NCHW format, and the groups must be greater than 1 +template +class MaxOutFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + lite::Tensor* output, + int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->template mutable_data(lite::TargetType::kLoongArch); + + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + for (int ph = 0; ph < groups; ++ph) { + T x = input_data[(new_bindex + new_cindex) * groups + + ph * fea_size + f]; + ele = ele > x ? ele : x; + } + output_data[(new_bindex + new_cindex + f)] = ele; + } + } + } + } +}; + +template +class MaxOutGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + lite::Tensor* input_grad, + const lite::Tensor& output, + const lite::Tensor& output_grad, + int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kLoongArch); + + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0 = (blen + clen) * groups + f; + bool continue_match = true; + int output_idx = blen + clen + f; + for (int g = 0; g < groups && continue_match; ++g) { + int input_idx = input_idx0 + fea_size * g; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; + } + } + } + } + } + } +}; + +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/maxouting.h b/lite/backends/loongarch/math/maxouting.h new file mode 100644 index 00000000000..aedfaea7b67 --- /dev/null +++ b/lite/backends/loongarch/math/maxouting.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
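MaxOutFunctor reduces the channel dimension by a factor of `groups`: output channel c is the elementwise maximum over input channels [c * groups, (c + 1) * groups) of the same NCHW sample. The standalone sketch below reproduces that indexing on plain buffers (the function name and driver are illustrative, not the lite::Tensor interface).

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <vector>

// Maxout over NCHW data: out[n][c][h][w] = max_g in[n][c * groups + g][h][w],
// following the same flat indexing as MaxOutFunctor.
void maxout_ref(const float* in, float* out, int batch, int out_channels,
                int height, int width, int groups) {
  const int fea_size = height * width;
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < out_channels; ++c) {
      for (int f = 0; f < fea_size; ++f) {
        float ele = -FLT_MAX;
        for (int g = 0; g < groups; ++g) {
          const int in_c = c * groups + g;
          ele = std::max(ele,
                         in[(n * out_channels * groups + in_c) * fea_size + f]);
        }
        out[(n * out_channels + c) * fea_size + f] = ele;
      }
    }
  }
}

int main() {
  // 1 sample, 2 input channels, groups = 2 -> 1 output channel, 1x2 plane.
  std::vector<float> in = {1.f, 5.f, 4.f, 2.f};
  std::vector<float> out(2);
  maxout_ref(in.data(), out.data(), 1, /*out_channels=*/1, 1, 2, /*groups=*/2);
  assert(out[0] == 4.f && out[1] == 5.f);
  return 0;
}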
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +class MaxOutFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + lite::Tensor* output, + int groups); +}; + +template +class MaxOutGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + lite::Tensor* input_grad, + const lite::Tensor& output, + const lite::Tensor& output_grad, + int groups); +}; +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/pooling.cc b/lite/backends/loongarch/math/pooling.cc new file mode 100644 index 00000000000..299329cd8cc --- /dev/null +++ b/lite/backends/loongarch/math/pooling.cc @@ -0,0 +1,918 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/pooling.h" +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class Pool2dFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor* input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + lite::Tensor* output) { + const int batch_size = input->dims()[0]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[2]; + + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input->template data(); + T* output_data = output->template mutable_data(lite::TargetType::kLoongArch); + + int hstart, hend; + int wstart, wend; + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + + T ele = pool_process.initial(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute(input_data[h * input_width + w], &ele); + } + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } + + pool_process.finalize(static_cast(pool_size), &ele); + output_data[ph * output_width + pw] = ele; + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } +}; + +/* +* All tensors are in NCHW format. +* Ksize, strides, paddings are two elements. These two elements represent height +* and width, respectively. 
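Pool2dFunctor computes per-output-pixel window bounds, folds the window through the PoolProcess (max or sum), and finalizes with the pool size; with `exclusive` set, the divisor is the number of in-bounds elements rather than ksize_height * ksize_width. The single-channel sketch below shows exclusive average pooling on a plain buffer (hypothetical helper, no channels or batching) to make that divisor rule concrete.

#include <algorithm>
#include <cassert>
#include <vector>

// Exclusive average pooling on one H x W channel: the divisor is the count of
// valid (un-padded) elements in each window, as in Pool2dFunctor when
// exclusive == true.
void avg_pool2d_exclusive(const float* in, float* out, int in_h, int in_w,
                          int k, int stride, int pad, int out_h, int out_w) {
  for (int ph = 0; ph < out_h; ++ph) {
    int hstart = std::max(ph * stride - pad, 0);
    int hend = std::min(ph * stride - pad + k, in_h);
    for (int pw = 0; pw < out_w; ++pw) {
      int wstart = std::max(pw * stride - pad, 0);
      int wend = std::min(pw * stride - pad + k, in_w);
      float sum = 0.f;
      for (int h = hstart; h < hend; ++h)
        for (int w = wstart; w < wend; ++w) sum += in[h * in_w + w];
      out[ph * out_w + pw] = sum / ((hend - hstart) * (wend - wstart));
    }
  }
}

int main() {
  // 2x2 input, 2x2 kernel, stride 1, pad 1 -> 3x3 output; each corner output
  // averages a single valid element under exclusive pooling.
  std::vector<float> in = {1, 2, 3, 4};
  std::vector<float> out(9);
  avg_pool2d_exclusive(in.data(), out.data(), 2, 2, 2, 1, 1, 3, 3);
  assert(out[0] == 1.f && out[4] == 2.5f && out[8] == 4.f);
  return 0;
}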
+*/ +template +class Pool2dGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const lite::Tensor& output, + const lite::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + PoolProcess pool_grad_process, + bool exclusive, + bool adaptive, + lite::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[2]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kLoongArch); + + int hstart, hend; + int wstart, wend; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = (std::min)(hstart + ksize_height, input_height); + hstart = (std::max)(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = (std::min)(wstart + ksize_width, input_width); + wstart = (std::max)(wstart, 0); + } + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + float scale = 1.0 / pool_size; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_grad_process.compute( + input_data[h * input_width + w], + output_data[ph * output_width + pw], + output_grad_data[ph * output_width + pw], + static_cast(scale), + input_grad_data + h * input_width + w); + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
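For the average-pool backward pass, each output gradient is spread uniformly back over its window, scaled by 1 / pool_size (again adjusted when exclusive or adaptive). A compact single-channel scatter under the same plain-buffer assumptions as the previous sketch:

#include <algorithm>
#include <cassert>
#include <vector>

// Average-pool gradient for one H x W channel without padding: every input
// element of a window receives dy / pool_size, as AvgPoolGrad::compute does.
void avg_pool2d_grad(const float* dout, float* din, int in_h, int in_w,
                     int k, int stride, int out_h, int out_w) {
  for (int ph = 0; ph < out_h; ++ph) {
    for (int pw = 0; pw < out_w; ++pw) {
      int hstart = ph * stride, hend = std::min(hstart + k, in_h);
      int wstart = pw * stride, wend = std::min(wstart + k, in_w);
      float scale = 1.f / ((hend - hstart) * (wend - wstart));
      for (int h = hstart; h < hend; ++h)
        for (int w = wstart; w < wend; ++w)
          din[h * in_w + w] += scale * dout[ph * out_w + pw];
    }
  }
}

int main() {
  // 2x2 input, 2x2 kernel, stride 2 -> one output; the gradient 4.0 is split
  // evenly, so every input element receives 1.0.
  std::vector<float> dout = {4.f};
  std::vector<float> din(4, 0.f);
  avg_pool2d_grad(dout.data(), din.data(), 2, 2, 2, 2, 1, 1);
  assert(din[0] == 1.f && din[3] == 1.f);
  return 0;
}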
+ */ +template +class MaxPool2dGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const lite::Tensor& output, + const lite::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + lite::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[2]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kLoongArch); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = (std::min)(hstart + ksize_height, input_height); + hstart = (std::max)(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = (std::min)(wstart + ksize_width, input_width); + wstart = (std::max)(wstart, 0); + + bool stop = false; + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = h * input_width + w; + int output_idx = ph * output_width + pw; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, + float>; +template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, + double>; +template class Pool2dFunctor, + double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
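MaxPool2dGradFunctor scans each window and adds the output gradient to the first element equal to the pooled maximum, then stops (the `stop` flag), so on ties the whole gradient goes to the first match in scan order. Illustrated standalone on one channel without padding (helper name and driver are for the sketch only):

#include <algorithm>
#include <cassert>
#include <vector>

// Max-pool gradient: the output gradient is routed to the first window
// element equal to the pooled maximum, matching the `stop` flag logic above.
void max_pool2d_grad(const float* in, const float* out, const float* dout,
                     float* din, int in_h, int in_w, int k, int stride,
                     int out_h, int out_w) {
  for (int ph = 0; ph < out_h; ++ph) {
    for (int pw = 0; pw < out_w; ++pw) {
      int hstart = ph * stride, hend = std::min(hstart + k, in_h);
      int wstart = pw * stride, wend = std::min(wstart + k, in_w);
      bool stop = false;
      for (int h = hstart; h < hend && !stop; ++h)
        for (int w = wstart; w < wend && !stop; ++w)
          if (in[h * in_w + w] == out[ph * out_w + pw]) {
            din[h * in_w + w] += dout[ph * out_w + pw];
            stop = true;
          }
    }
  }
}

int main() {
  // Tie between in[1] and in[2]: only the first maximum (scan order) gets
  // the gradient.
  std::vector<float> in = {1.f, 7.f, 7.f, 2.f}, out = {7.f}, dout = {3.f};
  std::vector<float> din(4, 0.f);
  max_pool2d_grad(in.data(), out.data(), dout.data(), din.data(),
                  2, 2, 2, 2, 1, 1);
  assert(din[1] == 3.f && din[2] == 0.f);
  return 0;
}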
+ */ +template +class Pool3dFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + lite::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + T* output_data = output->template mutable_data(lite::TargetType::kLoongArch); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = (std::min)(dstart + ksize_depth, input_depth); + dstart = (std::max)(dstart, 0); + } + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = (std::min)(hstart + ksize_height, input_height); + hstart = (std::max)(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = (std::min)(wstart + ksize_width, input_width); + wstart = (std::max)(wstart, 0); + } + int output_idx = (pd * output_height + ph) * output_width + pw; + T ele = pool_process.initial(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute( + input_data[(d * input_height + h) * input_width + w], + &ele); + } + } + } + int pool_size = + (exclusive || adaptive) + ? (dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + pool_process.finalize(static_cast(pool_size), &ele); + output_data[output_idx] = ele; + } + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class Pool3dGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const lite::Tensor& output, + const lite::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + PoolProcess pool_grad_process, + bool exclusive, + bool adaptive, + lite::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kLoongArch); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = (std::min)(dstart + ksize_depth, input_depth); + dstart = (std::max)(dstart, 0); + } + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = (std::min)(hstart + ksize_height, input_height); + hstart = (std::max)(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = (std::min)(wstart + ksize_width, input_width); + wstart = (std::max)(wstart, 0); + } + + int pool_size = + (exclusive || adaptive) + ? (dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + float scale = 1.0 / pool_size; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class MaxPool3dGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const lite::Tensor& output, + const lite::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + lite::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kLoongArch); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = (std::min)(dstart + ksize_depth, input_depth); + dstart = (std::max)(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = (std::min)(hstart + ksize_height, input_height); + hstart = (std::max)(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = (std::min)(wstart + ksize_width, input_width); + wstart = (std::max)(wstart, 0); + bool stop = false; + for (int d = dstart; d < dend && !stop; ++d) { + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += + output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, + float>; +template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, + double>; +template class Pool3dFunctor, + double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + lite::Tensor* output, + lite::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[2]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(lite::TargetType::kLoongArch); + T2* mask_data = mask->mutable_data(lite::TargetType::kLoongArch); + + int hstart, hend; + int wstart, wend; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = (std::min)(hstart + ksize_height, input_height); + hstart = (std::max)(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = (std::min)(wstart + ksize_width, input_width); + wstart = (std::max)(wstart, 0); + } + + T1 ele = static_cast(-FLT_MAX); + int index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (ele < input_data[h * input_width + w]) { + ele = input_data[h * input_width + w]; + index = h * input_width + w; + } + } + } + output_data[ph * output_width + pw] = ele; + mask_data[ph * output_width + pw] = index; + } + } + // offset + input_data += input_stride; + output_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
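MaxPool2dWithIndexFunctor additionally writes a mask tensor holding, for each output element, the flat h * input_width + w position of the maximum inside the input plane; the backward functor then simply scatters gradients through that mask. A single-channel sketch of the forward pass (plain buffers, illustrative helper name):

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <vector>

// Max pooling with index for one channel, no padding: `mask` stores the flat
// input position (h * in_w + w) of each window's maximum.
void max_pool2d_with_index(const float* in, float* out, int* mask,
                           int in_h, int in_w, int k, int stride,
                           int out_h, int out_w) {
  for (int ph = 0; ph < out_h; ++ph) {
    for (int pw = 0; pw < out_w; ++pw) {
      int hstart = ph * stride, hend = std::min(hstart + k, in_h);
      int wstart = pw * stride, wend = std::min(wstart + k, in_w);
      float best = -FLT_MAX;
      int index = -1;
      for (int h = hstart; h < hend; ++h)
        for (int w = wstart; w < wend; ++w)
          if (in[h * in_w + w] > best) {
            best = in[h * in_w + w];
            index = h * in_w + w;
          }
      out[ph * out_w + pw] = best;
      mask[ph * out_w + pw] = index;
    }
  }
}

int main() {
  std::vector<float> in = {1.f, 9.f, 3.f, 4.f};  // 2x2
  std::vector<float> out(1);
  std::vector<int> mask(1);
  max_pool2d_with_index(in.data(), out.data(), mask.data(), 2, 2, 2, 2, 1, 1);
  assert(out[0] == 9.f && mask[0] == 1);
  return 0;
}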
+ */ +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& output_grad, + const lite::Tensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + lite::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_height = input_grad->dims()[2]; + const int input_width = input_grad->dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(lite::TargetType::kLoongArch); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + for (int pw = 0; pw < output_width; ++pw) { + const int output_idx = ph * output_width + pw; + const int input_idx = static_cast(mask_data[output_idx]); + input_grad_data[input_idx] += output_grad_data[output_idx]; + } + } + // offset + input_grad_data += input_stride; + output_grad_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + lite::Tensor* output, + lite::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(lite::TargetType::kLoongArch); + T2* mask_data = mask->mutable_data(lite::TargetType::kLoongArch); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = (std::min)(dstart + ksize_depth, input_depth); + dstart = (std::max)(dstart, 0); + } 
+ for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = (std::min)(hstart + ksize_height, input_height); + hstart = (std::max)(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = (std::min)(wstart + ksize_width, input_width); + wstart = (std::max)(wstart, 0); + } + + int output_idx = (pd * output_height + ph) * output_width + pw; + T1 ele = static_cast(-FLT_MAX); + int index = -1; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + if (ele < input_data[input_idx]) { + index = input_idx; + ele = input_data[input_idx]; + } + } + } + } + output_data[output_idx] = ele; + mask_data[output_idx] = index; + } + } + } + // offset + input_data += input_stride; + output_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& output_grad, + const lite::Tensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + lite::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_depth = input_grad->dims()[2]; + const int input_height = input_grad->dims()[3]; + const int input_width = input_grad->dims()[4]; + const int output_channels = output_grad.dims()[1]; + const int output_depth = output_grad.dims()[2]; + const int output_height = output_grad.dims()[3]; + const int output_width = output_grad.dims()[4]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(lite::TargetType::kLoongArch); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + for (int ph = 0; ph < output_height; ++ph) { + for (int pw = 0; pw < output_width; ++pw) { + const int output_idx = + (pd * output_height + ph) * output_width + pw; + const int input_idx = static_cast(mask_data[output_idx]); + input_grad_data[input_idx] += output_grad_data[output_idx]; + } + } + } + // offset + input_grad_data += input_stride; + output_grad_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/pooling.h b/lite/backends/loongarch/math/pooling.h new file mode 100644 index 00000000000..30c926095a6 --- /dev/null +++ b/lite/backends/loongarch/math/pooling.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2016 
PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * \brief Extracting simple operations from pooling. + * Both MaxPool and AvgPool need "initial", "compute" and "finalize" + * operation. + * MaxPool initializes temp variable to the negative maximum to find the + * maximum value in the pooling field. + * AvgPool initializes temp variable to the zero to accumulate all values + * in pool pooling, and finally takes the average. + * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. + */ +template +class MaxPool { + public: + DEVICE inline T initial() { return static_cast(-FLT_MAX); } + DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + DEVICE inline void finalize(const T& pool_field, T* y) {} +}; + +template +class AvgPool { + public: + DEVICE inline T initial() { return static_cast(0); } + DEVICE inline void compute(const T& x, T* y) { *y += x; } + DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; } +}; + +template +class MaxPoolGrad { + public: + DEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += dy * (x == y); + } +}; + +template +class AvgPoolGrad { + public: + DEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += (scale * dy); + } +}; + +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + +/* + * \brief Getting pooling results, and calculating gradient. + * + * In pool2d, all tensors are in NCHW format. Where N is batch size, C is the + * number of channels, H and W is the height and width of feature. + * In pool3d, all tensors are in NCDHW format. Where N is batch size, C is the + * number of channels, D, H and W is the depth, height and width of feature. + * + * In max pooling, it is possible that the pooling region has multiple maximum + * elements. In this case, we should compute the gradient of the first maximum + * element. + * This is different from average pooling. So we rewrite the max_pool_grad: + * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
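The PoolProcess classes factor pooling into initial / compute / finalize steps so one set of window loops serves both max and average pooling, and AdaptStartIndex / AdaptEndIndex split the input extent into output_size nearly equal bins for adaptive pooling. A minimal standalone illustration of both ideas (floor/ceil on a double is assumed for the index helpers, as in the source comments; all names here are sketch-only):

#include <cassert>
#include <cfloat>
#include <cmath>

// Same three-step protocol as MaxPool / AvgPool in pooling.h.
struct MaxProcess {
  float initial() const { return -FLT_MAX; }
  void compute(float x, float* y) const { *y = *y > x ? *y : x; }
  void finalize(float /*pool_field*/, float* /*y*/) const {}
};
struct AvgProcess {
  float initial() const { return 0.f; }
  void compute(float x, float* y) const { *y += x; }
  void finalize(float pool_field, float* y) const { *y /= pool_field; }
};

// Adaptive bin ph covers [floor(ph * in / out), ceil((ph + 1) * in / out)).
inline int adapt_start(int ph, int in_size, int out_size) {
  return static_cast<int>(
      std::floor(static_cast<double>(ph * in_size) / out_size));
}
inline int adapt_end(int ph, int in_size, int out_size) {
  return static_cast<int>(
      std::ceil(static_cast<double>((ph + 1) * in_size) / out_size));
}

template <typename Process>
float pool_window(const float* data, int start, int end, Process p) {
  float y = p.initial();
  for (int i = start; i < end; ++i) p.compute(data[i], &y);
  p.finalize(static_cast<float>(end - start), &y);
  return y;
}

int main() {
  const float v[5] = {1, 2, 3, 4, 5};
  // Adaptive split of 5 elements into 2 bins: [0, 3) and [2, 5).
  assert(adapt_start(0, 5, 2) == 0 && adapt_end(0, 5, 2) == 3);
  assert(adapt_start(1, 5, 2) == 2 && adapt_end(1, 5, 2) == 5);
  assert(pool_window(v, 0, 3, MaxProcess{}) == 3.f);
  assert(pool_window(v, 2, 5, AvgProcess{}) == 4.f);
  return 0;
}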
+ */ + +template +class Pool2dFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor* input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + PoolProcess pool_compute, + bool exclusive, + bool adaptive, + lite::Tensor* output); +}; + +template +class Pool2dGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const lite::Tensor& output, + const lite::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + PoolProcess pool_compute, + bool exclusive, + bool adaptive, + lite::Tensor* input_grad); +}; + +template +class MaxPool2dGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const lite::Tensor& output, + const lite::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + lite::Tensor* input_grad); +}; + +template +class Pool3dFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + PoolProcess pool_compute, + bool exclusive, + bool adaptive, + lite::Tensor* output); +}; + +template +class Pool3dGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const lite::Tensor& output, + const lite::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + PoolProcess pool_compute, + bool exclusive, + bool adaptive, + lite::Tensor* input_grad); +}; + +template +class MaxPool3dGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const lite::Tensor& output, + const lite::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + lite::Tensor* input_grad); +}; + +/* + * \brief Getting max pooling results and corresponding max index, and + * calculating gradient. + * In up-sampling-pooling, it is necessary to know max element index. + * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in + * NCDHW format. 
+ */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + lite::Tensor* output, + lite::Tensor* mask); +}; + +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& output_grad, + const lite::Tensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + lite::Tensor* input_grad); +}; + +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + lite::Tensor* output, + lite::Tensor* mask); +}; + +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& output_grad, + const lite::Tensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + lite::Tensor* input_grad); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/power.cc b/lite/backends/loongarch/math/power.cc new file mode 100644 index 00000000000..7045104f253 --- /dev/null +++ b/lite/backends/loongarch/math/power.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/loongarch/math/power.h" +#include "lite/backends/loongarch/xxl.h" +#include +#include "lite/backends/loongarch/math/include/mathfuns.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template <> +void power(const float* din, + float* dout, + const int num, + float scale_, + float shift_, + float factor_) { + int cnt = num >> 4; + int remain = num % 16; + bool _do_power = true; + bool _do_scale = true; + bool _do_shift = true; + int rem_cnt = remain >> 2; + int rem_rem = remain & 3; + if (fabsf(factor_ - 1.f) < 1e-6f) { + _do_power = false; + } + if (fabsf(scale_ - 1.f) < 1e-6f) { + _do_scale = false; + } + if (fabsf(shift_ - 0.f) < 1e-6f) { + _do_shift = false; + } +#ifdef __loongarch_asx + __m256 vscale_256 = lasx_set1_f32(scale_); + __m256 vshift_256 = lasx_set1_f32(shift_); + __m256 vfactor_256 = lasx_set1_f32(factor_); +#endif + __m128 vscale = lsx_set1_f32(scale_); + __m128 vshift = lsx_set1_f32(shift_); + float* ptr_out = dout; + const float* ptr_in = din; + if (_do_power) { + for (int i = 0; i < cnt; i++) { +#ifdef __loongarch_asx + __m256 vin0 = lasx_loadu_f32(ptr_in); + __m256 vin1 = lasx_loadu_f32(ptr_in + 8); + ptr_in += 16; + __m256 vsum0 = lasx_mul_f32(vin0, vscale_256); + __m256 vsum1 = lasx_mul_f32(vin1, vscale_256); + __m256 vres0 = lasx_add_f32(vsum0, vshift_256); + __m256 vres1 = lasx_add_f32(vsum1, vshift_256); + vres0 = pow256_ps(vres0, vfactor_256); + vres1 = pow256_ps(vres1, vfactor_256); + lasx_storeu_f32(ptr_out, vres0); + lasx_storeu_f32(ptr_out + 8, vres1); +#else + __m128 vin0 = lsx_loadu_f32(ptr_in); + __m128 vin1 = lsx_loadu_f32(ptr_in + 4); + __m128 vin2 = lsx_loadu_f32(ptr_in + 8); + __m128 vin3 = lsx_loadu_f32(ptr_in + 12); + __m128 vsum0 = lsx_mul_f32(vin0, vscale); + __m128 vsum1 = lsx_mul_f32(vin1, vscale); + __m128 vsum2 = lsx_mul_f32(vin2, vscale); + __m128 vsum3 = lsx_mul_f32(vin3, vscale); + __m128 vres0 = lsx_add_f32(vsum0, vshift); + __m128 vres1 = lsx_add_f32(vsum1, vshift); + __m128 vres2 = lsx_add_f32(vsum2, vshift); + __m128 vres3 = lsx_add_f32(vsum3, vshift); + + ptr_in += 16; + for (int j = 0; j < 4; j++) { + ptr_out[j] = std::pow((reinterpret_cast(&vres0))[j], factor_); + ptr_out[j + 4] = + std::pow((reinterpret_cast(&vres1))[j], factor_); + ptr_out[j + 8] = + std::pow((reinterpret_cast(&vres2))[j], factor_); + ptr_out[j + 12] = + std::pow((reinterpret_cast(&vres3))[j], factor_); + } +#endif + ptr_out += 16; + } + for (int i = 0; i < rem_cnt; i++) { + __m128 vin0 = lsx_loadu_f32(ptr_in); + ptr_in += 4; + __m128 vsum0 = lsx_mul_f32(vin0, vscale); + __m128 vres0 = lsx_add_f32(vsum0, vshift); + for (int j = 0; j < 4; j++) { + ptr_out[j] = std::pow((reinterpret_cast(&vres0))[j], factor_); + } + ptr_out += 4; + } + for (int i = 0; i < rem_rem; i++) { + ptr_out[0] = std::pow((ptr_in[0] * scale_ + shift_), factor_); + ptr_in++; + ptr_out++; + } + } else { + for (int i = 0; i < cnt; i++) { +#ifdef __loongarch_asx + __m256 vin0 = lasx_loadu_f32(ptr_in); + __m256 vin1 = lasx_loadu_f32(ptr_in + 8); + ptr_in += 16; + __m256 vsum0 = lasx_mul_f32(vin0, vscale_256); + __m256 vsum1 = lasx_mul_f32(vin1, vscale_256); + __m256 vres0 = lasx_add_f32(vsum0, vshift_256); + __m256 vres1 = lasx_add_f32(vsum1, vshift_256); + lasx_storeu_f32(ptr_out, vres0); + lasx_storeu_f32(ptr_out + 8, vres1); + ptr_out += 16; +#else + __m128 vin0 = lsx_loadu_f32(ptr_in); + __m128 vin1 = lsx_loadu_f32(ptr_in + 4); + __m128 vin2 = lsx_loadu_f32(ptr_in + 8); + __m128 vin3 = lsx_loadu_f32(ptr_in + 12); + __m128 vsum0 = 
lsx_mul_f32(vin0, vscale); + __m128 vsum1 = lsx_mul_f32(vin1, vscale); + __m128 vsum2 = lsx_mul_f32(vin2, vscale); + __m128 vsum3 = lsx_mul_f32(vin3, vscale); + __m128 vres0 = lsx_add_f32(vsum0, vshift); + __m128 vres1 = lsx_add_f32(vsum1, vshift); + __m128 vres2 = lsx_add_f32(vsum2, vshift); + __m128 vres3 = lsx_add_f32(vsum3, vshift); + + ptr_in += 16; + lsx_storeu_f32(ptr_out, vres0); + lsx_storeu_f32(ptr_out + 4, vres1); + lsx_storeu_f32(ptr_out + 8, vres2); + lsx_storeu_f32(ptr_out + 12, vres3); + ptr_out += 16; +#endif + } + for (int i = 0; i < rem_cnt; i++) { + __m128 vin0 = lsx_loadu_f32(ptr_in); + ptr_in += 4; + __m128 vsum0 = lsx_mul_f32(vin0, vscale); + __m128 vres0 = lsx_add_f32(vsum0, vshift); + + lsx_storeu_f32(ptr_out, vres0); + ptr_out += 4; + } + for (int i = 0; i < rem_rem; i++) { + ptr_out[0] = ptr_in[0] * scale_ + shift_; + ptr_in++; + ptr_out++; + } + } +} + +} /* namespace math */ +} /* namespace loongarch */ +} /* namespace lite */ +} /* namespace paddle */ diff --git a/lite/backends/loongarch/math/power.h b/lite/backends/loongarch/math/power.h new file mode 100644 index 00000000000..0da9abc2c6a --- /dev/null +++ b/lite/backends/loongarch/math/power.h @@ -0,0 +1,33 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +void power(const T* din, + T* dout, + const int num, + float scale_, + float shift_, + float factor_); + +} /* namespace math */ +} /* namespace loongarch */ +} /* namespace lite */ +} /* namespace paddle */ diff --git a/lite/backends/loongarch/math/prior_box.cc b/lite/backends/loongarch/math/prior_box.cc new file mode 100644 index 00000000000..d8202c9f2f1 --- /dev/null +++ b/lite/backends/loongarch/math/prior_box.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
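The power kernel computes out[i] = (in[i] * scale + shift) ^ factor, vectorizing the multiply-add with LASX/LSX and skipping the pow entirely when factor is (numerically) 1. A scalar reference of the same contract, useful for validating the vectorized path (the helper name is hypothetical):

#include <cassert>
#include <cmath>
#include <vector>

// Scalar reference for the vectorized power kernel: applies scale and shift,
// then raises to `factor`, skipping std::pow when factor == 1 within 1e-6.
void power_ref(const float* din, float* dout, int num,
               float scale, float shift, float factor) {
  const bool do_power = std::fabs(factor - 1.f) >= 1e-6f;
  for (int i = 0; i < num; ++i) {
    float v = din[i] * scale + shift;
    dout[i] = do_power ? std::pow(v, factor) : v;
  }
}

int main() {
  std::vector<float> in = {1.f, 2.f, 3.f}, out(3);
  power_ref(in.data(), out.data(), 3, /*scale=*/2.f, /*shift=*/1.f,
            /*factor=*/2.f);
  assert(std::fabs(out[0] - 9.f) < 1e-4f);
  assert(std::fabs(out[1] - 25.f) < 1e-4f);
  assert(std::fabs(out[2] - 49.f) < 1e-4f);
  return 0;
}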
*/ + +#include "lite/backends/loongarch/math/prior_box.h" +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void density_prior_box(const int64_t img_width, + const int64_t img_height, + const int64_t feature_width, + const int64_t feature_height, + const float* input_data, + const float* image_data, + const bool clip, + const std::vector variances, + const std::vector fixed_sizes, + const std::vector fixed_ratios, + const std::vector densities, + const float step_width, + const float step_height, + const float offset, + const int num_priors, + float* boxes_data, + float* vars_data) { + int step_average = static_cast((step_width + step_height) * 0.5); + + std::vector sqrt_fixed_ratios; + for (size_t i = 0; i < fixed_ratios.size(); i++) { + sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i])); + } + +#pragma omp parallel for collapse(2) + for (int64_t h = 0; h < feature_height; ++h) { + for (int64_t w = 0; w < feature_width; ++w) { + float center_x = (w + offset) * step_width; + float center_y = (h + offset) * step_height; + int64_t offset = (h * feature_width + w) * num_priors * 4; + // Generate density prior boxes with fixed sizes. + for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + int shift = step_average / density; + // Generate density prior boxes with fixed ratios. + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float box_width_ratio = fixed_size * sqrt_fixed_ratios[r]; + float box_height_ratio = fixed_size / sqrt_fixed_ratios[r]; + float density_center_x = center_x - step_average / 2. + shift / 2.; + float density_center_y = center_y - step_average / 2. + shift / 2.; + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = density_center_x + dj * shift; + float center_y_temp = density_center_y + di * shift; + boxes_data[offset++] = (std::max)( + (center_x_temp - box_width_ratio / 2.) / img_width, 0.); + boxes_data[offset++] = (std::max)( + (center_y_temp - box_height_ratio / 2.) / img_height, 0.); + boxes_data[offset++] = (std::min)( + (center_x_temp + box_width_ratio / 2.) / img_width, 1.); + boxes_data[offset++] = (std::min)( + (center_y_temp + box_height_ratio / 2.) / img_height, 1.); + } + } + } + } + } + } + //! clip the prior's coordinate such that it is within [0, 1] + if (clip) { + int channel_size = feature_height * feature_width * num_priors * 4; +#pragma omp parallel for + for (int d = 0; d < channel_size; ++d) { + boxes_data[d] = (std::min)((std::max)(boxes_data[d], 0.f), 1.f); + } + } +//! set the variance. +#pragma omp parallel for collapse(3) + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + for (int i = 0; i < num_priors; ++i) { + int idx = ((h * feature_width + w) * num_priors + i) * 4; + vars_data[idx++] = variances[0]; + vars_data[idx++] = variances[1]; + vars_data[idx++] = variances[2]; + vars_data[idx++] = variances[3]; + } + } + } +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/prior_box.h b/lite/backends/loongarch/math/prior_box.h new file mode 100644 index 00000000000..8953ba58aad --- /dev/null +++ b/lite/backends/loongarch/math/prior_box.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
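density_prior_box walks the feature map and, for each cell and each (fixed_size, fixed_ratio) pair, lays down a density x density grid of boxes around the cell center, normalized by the image size and clamped to [0, 1]. The reduced sketch below generates the boxes of one cell for a single size/ratio pair; parameter names are assumptions for the sketch, and step_average plays the same role as in the code above.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Boxes for one feature-map cell and one fixed_size / fixed_ratio pair,
// mirroring the inner loops of density_prior_box. Each box is
// [xmin, ymin, xmax, ymax], normalized and clamped to [0, 1].
std::vector<float> cell_density_boxes(float center_x, float center_y,
                                      float fixed_size, float fixed_ratio,
                                      int density, int step_average,
                                      int img_w, int img_h) {
  std::vector<float> boxes;
  const float box_w = fixed_size * std::sqrt(fixed_ratio);
  const float box_h = fixed_size / std::sqrt(fixed_ratio);
  const int shift = step_average / density;
  const float x0 = center_x - step_average / 2.f + shift / 2.f;
  const float y0 = center_y - step_average / 2.f + shift / 2.f;
  for (int di = 0; di < density; ++di) {
    for (int dj = 0; dj < density; ++dj) {
      const float cx = x0 + dj * shift;
      const float cy = y0 + di * shift;
      boxes.push_back(std::max((cx - box_w / 2.f) / img_w, 0.f));
      boxes.push_back(std::max((cy - box_h / 2.f) / img_h, 0.f));
      boxes.push_back(std::min((cx + box_w / 2.f) / img_w, 1.f));
      boxes.push_back(std::min((cy + box_h / 2.f) / img_h, 1.f));
    }
  }
  return boxes;
}

int main() {
  // One 32x32 cell of a 320x320 image, fixed_size 64, ratio 1, density 2:
  // four boxes around the cell center.
  auto boxes = cell_density_boxes(/*center_x=*/16.f, /*center_y=*/16.f,
                                  64.f, 1.f, /*density=*/2,
                                  /*step_average=*/32, 320, 320);
  std::printf("%zu boxes, first = [%.3f, %.3f, %.3f, %.3f]\n",
              boxes.size() / 4, boxes[0], boxes[1], boxes[2], boxes[3]);
  return 0;
}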
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/loongarch/math/math_function.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +void density_prior_box(const int64_t img_width, + const int64_t img_height, + const int64_t feature_width, + const int64_t feature_height, + const float* input_data, + const float* image_data, + const bool clip, + const std::vector variances, + const std::vector fixed_sizes, + const std::vector fixed_ratios, + const std::vector densities, + const float step_width, + const float step_height, + const float offset, + const int num_priors, + float* boxes_data, + float* vars_data); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/rnn.h b/lite/backends/loongarch/math/rnn.h new file mode 100644 index 00000000000..639f61f196c --- /dev/null +++ b/lite/backends/loongarch/math/rnn.h @@ -0,0 +1,564 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
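As a usage note for the density_prior_box routine added above: every prior contributes four box coordinates per feature-map location, and num_priors must match what the nested loops over fixed_sizes, fixed_ratios and densities actually emit. A small standalone helper showing the count implied by that loop structure (density_prior_count is an illustrative name, not an exported API):

#include <cstdio>
#include <vector>

// Priors per location implied by density_prior_box: for every fixed size s,
// every fixed ratio contributes densities[s] * densities[s] boxes.
static int density_prior_count(const std::vector<int>& densities,
                               size_t num_fixed_ratios) {
  int num_priors = 0;
  for (int d : densities) {
    num_priors += static_cast<int>(num_fixed_ratios) * d * d;
  }
  return num_priors;
}

int main() {
  // e.g. fixed_sizes = {32, 64}, fixed_ratios = {1.0}, densities = {4, 2}
  std::printf("%d\n", density_prior_count({4, 2}, 1));  // prints 20
  return 0;
}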
+ +#pragma once + +#include +#include "lite/backends/loongarch/math/activation_functions.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/xxl.h" +#include "lite/core/tensor.h" +#include "lite/utils/log/logging.h" + + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +namespace loongarch_forward = paddle::lite::loongarch::math::detail::forward; + +//************************************** +// Class Def +//************************************** +template +struct LstmMetaValue { + T* gate_value; + T* prev_state_value; + T* state_value; + T* state_active_value; + T* output_value; + T* check_ig; + T* check_fg; + T* check_og; +}; + +template +struct GRUMetaValue { + const T* gate_weight; + const T* state_weight; + const T* reset_bias; + T* gate_value; + T* reset_output_value; + T* output_value; + const T* prev_out_value; +}; + +//********************************* +// Inline Function +//********************************* +// if v2 isn't null: out[i] = in[i] + v1[i] * v2[i]; +// if v2 is null: out[i] = in[i] * v1[i]; +inline void vector_dot(float* out, + const float* in, + const float* v1, + int size, + const float* v2 = nullptr) { +#ifdef __loongarch_asx + __m256 vec_in, vec_v1, vec_v2; +#endif +#ifdef __loongarch_sx + __m128 vec_in_128, vec_v1_128, vec_v2_128; +#endif + + int i = 0; + if (nullptr == v2) { + i = 0; + +// in_out * v1 +#ifdef __loongarch_asx + for (; i + 7 < size; i += 8) { + vec_in = lasx_loadu_f32(in + i); + vec_v1 = lasx_loadu_f32(v1 + i); + lasx_storeu_f32(out + i, lasx_mul_f32(vec_in, vec_v1)); + } +#endif +#ifdef __loongarch_sx + for (; i + 3 < size; i += 4) { + vec_in_128 = lsx_loadu_f32(in + i); + vec_v1_128 = lsx_loadu_f32(v1 + i); + lsx_storeu_f32(out + i, lsx_mul_f32(vec_in_128, vec_v1_128)); + } +#endif + for (; i < size; i++) { + out[i] = in[i] * v1[i]; + } + } else { + i = 0; + +// in_out + v1 * v2 +#ifdef __loongarch_asx + for (; i + 7 < size; i += 8) { + vec_in = lasx_loadu_f32(in + i); + vec_v1 = lasx_loadu_f32(v1 + i); + vec_v2 = lasx_loadu_f32(v2 + i); + lasx_storeu_f32(out + i, lasx_fmadd_f32(vec_v2, vec_v1, vec_in)); + } +#endif +#ifdef __loongarch_sx + for (; i + 3 < size; i += 4) { + vec_in_128 = lsx_loadu_f32(in + i); + vec_v1_128 = lsx_loadu_f32(v1 + i); + vec_v2_128 = lsx_loadu_f32(v2 + i); + lsx_storeu_f32(out + i, lsx_fmadd_f32(vec_v2_128, vec_v1_128, vec_in_128)); + } +#endif + for (; i < size; i++) { + out[i] = in[i] + v1[i] * v2[i]; + } + } +} + +inline void fill_bias_fc(float* out, const float* bias, int num, int channel) { +#ifdef __loongarch_asx + __m256 vec_bias = {0.f}; + __m256 vec_data = {0.f}; +#endif +#ifdef __loongarch_sx + __m128 vec_bias_128 = {0.f}; + __m128 vec_data_128 = {0.f}; +#endif + int i = 0; + + for (int j = 0; j < num; j++) { + float* ptr = out + j * channel; + const float* pbias = bias; + i = 0; + +#ifdef __loongarch_asx + for (; i + 7 < channel; i += 8) { + vec_bias = lasx_loadu_f32(pbias + i); + vec_data = lasx_loadu_f32(ptr + i); + lasx_storeu_f32(ptr + i, lasx_add_f32(vec_data, vec_bias)); + } +#endif +#ifdef __loongarch_sx + for (; i + 3 < channel; i += 4) { + vec_bias_128 = lsx_loadu_f32(pbias + i); + vec_data_128 = lsx_loadu_f32(ptr + i); + lsx_storeu_f32(ptr + i, lsx_add_f32(vec_data_128, vec_bias_128)); + } +#endif + for (; i < channel; i++) { + *(ptr + i) = pbias[i] + ptr[i]; + } + } +} + +//******************************* +// Template Func +//******************************* +template +void act_relu(const T* din, T* dout, int size, int threads) { + for 
(int i = 0; i < size; i++) { + dout[i] = loongarch_forward::Relu(din[i]); + } +} + +template +void act_sigmoid(const T* din, T* dout, int size, int threads) { + for (int i = 0; i < size; i++) { + dout[i] = loongarch_forward::Sigmoid(din[i]); + } +} + +template +void act_tanh(const T* din, T* dout, int size, int threads) { + for (int i = 0; i < size; i++) { + dout[i] = loongarch_forward::Tanh(din[i]); + } +} + +template <> +void act_relu(const float* din, float* dout, int size, int threads) { + int i = 0; +#ifdef __loongarch_asx + for (; i + 7 < size; i += 8) { + __m256 a = lasx_loadu_f32(din + i); + lasx_storeu_f32(dout + i, loongarch_forward::lasx::Relu(a)); + } +#endif + for (; i < size; i++) { + dout[i] = loongarch_forward::Relu(din[i]); + } +} + +template <> +void act_sigmoid(const float* din, float* dout, int size, int threads) { + int i = 0; +#ifdef __loongarch_asx + for (; i + 7 < size; i += 8) { + __m256 a = lasx_loadu_f32(din + i); + lasx_storeu_f32(dout + i, loongarch_forward::lasx::Sigmoid(a)); + } +#endif + for (; i < size; i++) { + dout[i] = loongarch_forward::Sigmoid(din[i]); + } +} + +template <> +void act_tanh(const float* din, float* dout, int size, int threads) { + int i = 0; +#ifdef __loongarch_asx + for (; i + 7 < size; i += 8) { + __m256 a = lasx_loadu_f32(din + i); + lasx_storeu_f32(dout + i, loongarch_forward::lasx::Tanh(a)); + } +#endif + for (; i < size; i++) { + dout[i] = loongarch_forward::Tanh(din[i]); + } +} + +template +void activation( + const T* din, T* dout, int size, std::string act_str, int threads) { + if (act_str == "sigmoid") { + act_sigmoid(din, dout, size, threads); + } else if (act_str == "tanh") { + act_tanh(din, dout, size, threads); + } else if (act_str == "relu") { + act_relu(din, dout, size, threads); + } else { + LOG(FATAL) << "unsupport activation " << act_str; + } +} + +template +void activation(const T* din, + T* dout, + int size, + lite_api::ActivationType act_type, + int threads) { + switch (act_type) { + case lite_api::ActivationType::kSigmoid: + act_sigmoid(din, dout, size, threads); + break; + case lite_api::ActivationType::kSigmoid_v2: + act_sigmoid(din, dout, size, threads); + break; + case lite_api::ActivationType::kTanh: + act_tanh(din, dout, size, threads); + break; + case lite_api::ActivationType::kTanh_v2: + act_tanh(din, dout, size, threads); + break; + case lite_api::ActivationType::kRelu: + act_relu(din, dout, size, threads); + break; + default: + LOG(FATAL) << "unsupport activation type:" << static_cast(act_type); + break; + } +} + +//*********************************** +// LSTM MODE +//*********************************** +template +struct RnnLstmUnitFunctor { + static void compute(LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + lite_api::ActivationType gate_act, + lite_api::ActivationType cell_act, + lite_api::ActivationType cand_act, + int threads) { + const int temp_len = frame_size; + auto zero_ptr = static_cast( + TargetMalloc(TARGET(kLoongArch), temp_len * sizeof(float))); + memset(zero_ptr, 0, sizeof(float) * temp_len); + + for (int b = 0; b < batch_size; ++b) { + T* value_ig = value.gate_value; + T* value_fg = value_ig + frame_size; + T* value_in = value_fg + frame_size; + T* value_og = value_in + frame_size; + T* state = value.state_value; + T* state_act = value.state_active_value; + + T* check_i = value.check_ig ? value.check_ig : zero_ptr; + T* check_f = value.check_fg ? value.check_fg : zero_ptr; + T* check_o = value.check_og ? 
value.check_og : zero_ptr; + T* prev_state = + value.prev_state_value ? value.prev_state_value : zero_ptr; + + activation(value_in, value_in, frame_size, gate_act, threads); + vector_dot(value_ig, value_ig, prev_state, frame_size, check_i); + vector_dot(value_fg, value_fg, prev_state, frame_size, check_f); + activation(value_ig, value_ig, frame_size, cell_act, threads); + activation(value_fg, value_fg, frame_size, cell_act, threads); + vector_dot(state, value_in, value_ig, frame_size); + vector_dot(state, state, prev_state, frame_size, value_fg); + + for (int i = 0; i < frame_size; ++i) { + if (cell_clip > 0.0) { + if (state[i] < -1.0 * cell_clip) { + state[i] = -1.0 * cell_clip; + } + if (state[i] > cell_clip) { + state[i] = cell_clip; + } + } + } + + vector_dot(value_og, value_og, state, frame_size, check_o); + activation(value_og, value_og, frame_size, cell_act, threads); + activation(state, state_act, frame_size, cand_act, threads); + vector_dot(value.output_value, value_og, state_act, frame_size); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + + TargetFree(TARGET(kLoongArch), zero_ptr); + } +}; + +//************************************ +// GRU MODE +//************************************ +template +void GruRnnComputeKernel(GRUMetaValue value, + int frame_size, + int batch_size, + lite_api::ActivationType active_node, + lite_api::ActivationType active_gate) { + auto value_reset_gate = value.gate_value; + auto value_update_gate = value.gate_value + frame_size; + auto value_reset_output = value.reset_output_value; + auto value_reset_bias = value.reset_bias; + auto cell_state_value = value.gate_value + 2 * frame_size; + auto value_output = value.output_value; + auto value_prev_out = value.prev_out_value; + + for (int b = 0; b < batch_size; b++) { + activation(value_reset_gate, + value_reset_gate, + frame_size, + lite_api::ActivationType::kSigmoid_v2, + 1); + + activation(value_update_gate, + value_update_gate, + frame_size, + lite_api::ActivationType::kSigmoid_v2, + 1); + + for (int i = 0; i < frame_size; i++) { + value_reset_output[i] = + (value_reset_output[i] + value_reset_bias[i]) * value_reset_gate[i]; + cell_state_value[i] += value_reset_output[i]; + } + + activation(cell_state_value, + cell_state_value, + frame_size, + lite_api::ActivationType::kTanh_v2, + 1); + + if (value.prev_out_value) { + for (int i = 0; i < frame_size; i++) { + value_output[i] = (1.f - value_update_gate[i]) * cell_state_value[i] + + value_update_gate[i] * value_prev_out[i]; + } + } else { + for (int i = 0; i < frame_size; i++) { + value_output[i] = (1.f - value_update_gate[i]) * cell_state_value[i]; + } + } + + value_reset_gate += frame_size * 3; + value_update_gate += frame_size * 3; + value_reset_output += frame_size; + cell_state_value += frame_size * 3; + value_output += frame_size; + if (value.prev_out_value) { + value_prev_out += frame_size; + } + } +} + +template <> +void GruRnnComputeKernel(GRUMetaValue value, + int frame_size, + int batch_size, + lite_api::ActivationType active_node, + lite_api::ActivationType active_gate) { + auto value_reset_gate = value.gate_value; + auto value_update_gate = value.gate_value + frame_size; + auto value_reset_output = value.reset_output_value; + auto value_reset_bias = value.reset_bias; + auto cell_state_value = value.gate_value + 2 * frame_size; + auto value_output = 
value.output_value; + auto value_prev_out = value.prev_out_value; + int i = 0; + +#ifdef __loongarch_asx + __m256 vec_one_256 = lasx_set1_f32(1.0f); +#endif +#ifdef __loongarch_sx + __m128 vec_one_128 = lsx_set1_f32(1.0f); +#endif + + for (int b = 0; b < batch_size; b++) { + activation(value_reset_gate, + value_reset_gate, + frame_size, + lite_api::ActivationType::kSigmoid_v2, + 1); + activation(value_update_gate, + value_update_gate, + frame_size, + lite_api::ActivationType::kSigmoid_v2, + 1); + i = 0; +#ifdef __loongarch_asx + for (; i + 7 < frame_size; i += 8) { + __m256 vec_out = lasx_loadu_f32(value_reset_output + i); + __m256 vec_reset = lasx_loadu_f32(value_reset_gate + i); + __m256 vec_bias = lasx_loadu_f32(value_reset_bias + i); + vec_out = lasx_mul_f32(lasx_add_f32(vec_out, vec_bias), vec_reset); + lasx_storeu_f32(value_reset_output + i, vec_out); + lasx_storeu_f32( + cell_state_value + i, + lasx_add_f32(vec_out, lasx_loadu_f32(cell_state_value + i))); + } +#endif +#ifdef __loongarch_sx + for (; i + 3 < frame_size; i += 4) { + __m128 vec_out = lsx_loadu_f32(value_reset_output + i); + __m128 vec_reset = lsx_loadu_f32(value_reset_gate + i); + __m128 vec_bias = lsx_loadu_f32(value_reset_bias + i); + vec_out = lsx_mul_f32(lsx_add_f32(vec_out, vec_bias), vec_reset); + lsx_storeu_f32(value_reset_output + i, vec_out); + lsx_storeu_f32(cell_state_value + i, + lsx_add_f32(vec_out, lsx_loadu_f32(cell_state_value + i))); + } +#endif + for (; i < frame_size; i++) { + value_reset_output[i] = + (value_reset_output[i] + value_reset_bias[i]) * value_reset_gate[i]; + cell_state_value[i] += value_reset_output[i]; + } + + activation(cell_state_value, + cell_state_value, + frame_size, + lite_api::ActivationType::kTanh_v2, + 1); + + if (value.prev_out_value) { + i = 0; +#ifdef __loongarch_asx + for (; i + 7 < frame_size; i += 8) { + __m256 vec_vug = lasx_loadu_f32(value_update_gate + i); + __m256 vec_vpo = lasx_loadu_f32(value_prev_out + i); + __m256 vec_csv = lasx_loadu_f32(cell_state_value + i); + vec_vpo = lasx_mul_f32(vec_vug, vec_vpo); + __m256 vec_out = lasx_fmadd_f32( + vec_csv, lasx_sub_f32(vec_one_256, vec_vug), vec_vpo); + lasx_storeu_f32(value_output + i, vec_out); + } +#endif +#ifdef __loongarch_sx + for (; i + 3 < frame_size; i += 4) { + __m128 vec_vug = lsx_loadu_f32(value_update_gate + i); + __m128 vec_vpo = lsx_loadu_f32(value_prev_out + i); + __m128 vec_csv = lsx_loadu_f32(cell_state_value + i); + vec_vpo = lsx_mul_f32(vec_vug, vec_vpo); + __m128 vec_out = lsx_add_f32( + lsx_mul_f32(vec_csv, lsx_sub_f32(vec_one_128, vec_vug)), vec_vpo); + lsx_storeu_f32(value_output + i, vec_out); + } +#endif + for (; i < frame_size; i++) { + value_output[i] = (1.f - value_update_gate[i]) * cell_state_value[i] + + value_update_gate[i] * value_prev_out[i]; + } + } else { + i = 0; +#ifdef __loongarch_asx + for (; i + 7 < frame_size; i += 8) { + __m256 vec_vug = lasx_loadu_f32(value_update_gate + i); + __m256 vec_csv = lasx_loadu_f32(cell_state_value + i); + __m256 vec_out = + lasx_mul_f32(lasx_sub_f32(vec_one_256, vec_vug), vec_csv); + lasx_storeu_f32(value_output + i, vec_out); + } +#endif +#ifdef __loongarch_sx + for (; i + 3 < frame_size; i += 4) { + __m128 vec_vug = lsx_loadu_f32(value_update_gate + i); + __m128 vec_csv = lsx_loadu_f32(cell_state_value + i); + __m128 vec_out = lsx_mul_f32(lsx_sub_f32(vec_one_128, vec_vug), vec_csv); + lsx_storeu_f32(value_output + i, vec_out); + } +#endif + for (; i < frame_size; i++) { + value_output[i] = (1.f - value_update_gate[i]) * cell_state_value[i]; + } + } 
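For reference at this point in the kernel: both GruRnnComputeKernel variants implement the same per-frame recurrence, and the specialization above only vectorizes it with LASX/LSX. A scalar sketch of one step (gru_step_ref is illustrative only; gates are laid out as [reset | update | candidate] with stride frame_size, as in GRUMetaValue):

#include <cmath>

// Scalar reference for one GRU frame in the "linear before reset" form used
// above: ro already holds h_prev * state_weight^T, rb is the reset bias.
static void gru_step_ref(float* r, float* u, float* ro, const float* rb,
                         float* c, const float* h_prev, float* h_out,
                         int frame_size) {
  for (int i = 0; i < frame_size; ++i) {
    r[i] = 1.f / (1.f + std::exp(-r[i]));     // sigmoid reset gate
    u[i] = 1.f / (1.f + std::exp(-u[i]));     // sigmoid update gate
    ro[i] = (ro[i] + rb[i]) * r[i];           // gated reset output
    c[i] = std::tanh(c[i] + ro[i]);           // candidate state
    h_out[i] = h_prev ? (1.f - u[i]) * c[i] + u[i] * h_prev[i]
                      : (1.f - u[i]) * c[i];  // blend with previous output
  }
}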
+ + value_reset_gate += frame_size * 3; + value_update_gate += frame_size * 3; + value_reset_output += frame_size; + cell_state_value += frame_size * 3; + value_output += frame_size; + if (value.prev_out_value) { + value_prev_out += frame_size; + } + } +} + +template +struct RnnGruUnitFunctorV2 { + static void compute(LoongArchContext* ctx, + GRUMetaValue value, + int frame_size, + int batch_size, + lite_api::ActivationType active_node, + lite_api::ActivationType active_gate) { + if (value.prev_out_value) { + lite::loongarch::math::Blas matmul(*ctx); + matmul.GEMM(false, + true, + batch_size, + frame_size, + frame_size, + 1.f, + value.prev_out_value, + frame_size, + value.state_weight, + frame_size, + 0.f, + value.reset_output_value, + frame_size); + } + GruRnnComputeKernel( + value, frame_size, batch_size, active_node, active_gate); + } +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sample_prob.cc b/lite/backends/loongarch/math/sample_prob.cc new file mode 100644 index 00000000000..8f64c6af41a --- /dev/null +++ b/lite/backends/loongarch/math/sample_prob.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/sample_prob.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template class SampleWithProb; +template class SampleWithProb; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sample_prob.h b/lite/backends/loongarch/math/sample_prob.h new file mode 100644 index 00000000000..db968434fa0 --- /dev/null +++ b/lite/backends/loongarch/math/sample_prob.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
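RnnGruUnitFunctorV2 above precomputes reset_output_value = prev_out_value * state_weight^T with blas.GEMM(false, true, ...) before running the elementwise kernel. A scalar equivalent of that product, with the same row-major layout and beta = 0 (gemm_nt_ref is an illustrative name):

// reset_output[b][j] = sum_k prev_out[b][k] * state_weight[j][k],
// i.e. C(batch_size x frame_size) = A(batch_size x frame_size) * B^T.
static void gemm_nt_ref(const float* prev_out, const float* state_weight,
                        float* reset_output, int batch_size, int frame_size) {
  for (int b = 0; b < batch_size; ++b) {
    for (int j = 0; j < frame_size; ++j) {
      float acc = 0.f;
      for (int k = 0; k < frame_size; ++k) {
        acc += prev_out[b * frame_size + k] * state_weight[j * frame_size + k];
      }
      reset_output[b * frame_size + j] = acc;  // beta == 0.f overwrites
    }
  }
}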
*/ + +#pragma once +#include +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/math/sampler.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* UNDERSTAND: utility function to adjust probability for unique sampling, +return whatever as it is if not using unique samping */ +template +static T adjust_prob(const T prob, const int num_samples, const int num_tries) { + if (num_samples == num_tries) { + return prob * num_samples; + } else { + return -expm1(num_tries * log1p(-prob)); + } +} + +template +class SampleWithProb { + public: + void operator()(const lite::Context& context, + const Sampler& sampler, + const std::size_t num_samples, + const lite::Tensor* L, + lite::Tensor* S, + lite::Tensor* P) { + // UNDERSTAND: dimension issues + const auto lbl_dim = L->dims(); + const int batch_size = lbl_dim[0]; + const int num_true = lbl_dim[1]; + const int num_sampled_classes = num_true + num_samples; + // std::vector ret_dim_vec = {batch_size, num_sampled_classes}; + // lite::DDim ret_dim(ret_dim_vec); + + // UNDERSTAND: raw data view + const int64_t* label_data = L->data(); + // int64_t* samples_data = + // S->mutable_data(ret_dim, Target); + // T* probabilities_data = P->template mutable_data(ret_dim, Target); + S->Resize({batch_size, num_sampled_classes}); + auto* samples_data = S->mutable_data(Target); + P->Resize({batch_size, num_sampled_classes}); + auto* probabilities_data = P->template mutable_data(Target); + + // temp sets for unique sampling + std::set tmp_samples; + int j = 0; // column index + // add true labels, not that efficient + while (j < num_true) { + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + j; + auto v = label_data[i * num_true + j]; + samples_data[samples_index] = v; + probabilities_data[samples_index] = sampler.Probability(v); + } + ++j; + } + + // sample num_samles unique samples for an example, note that they are not + // all negative samples + tmp_samples.clear(); + int num_tries = 0; + while (j < num_sampled_classes) { + ++num_tries; + auto v = sampler.Sample(); + auto insert_ok = tmp_samples.insert(v).second; + if (!insert_ok) { + continue; + } + auto p = sampler.Probability(v); + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + j; + samples_data[samples_index] = v; + probabilities_data[samples_index] = p; + } + ++j; + } + + // compute Q(y|x), because of unique sampling, probabilities need to be + // adjusted + for (int k = 0; k < num_sampled_classes; ++k) { + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + k; + probabilities_data[samples_index] = adjust_prob( + probabilities_data[samples_index], num_samples, num_tries); + } + } + } +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sampler.cc b/lite/backends/loongarch/math/sampler.cc new file mode 100644 index 00000000000..8d71e4209e1 --- /dev/null +++ b/lite/backends/loongarch/math/sampler.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
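The adjust_prob helper above converts a single-draw probability into the probability that a class shows up at least once in num_tries draws: -expm1(n * log1p(-p)) is a numerically stable form of 1 - (1 - p)^n, while the true-label branch simply scales by num_samples. A quick standalone check of that identity:

#include <cmath>
#include <cstdio>

// Stable form of 1 - (1 - p)^n, as used by adjust_prob for unique sampling.
static double appear_at_least_once(double p, int num_tries) {
  return -std::expm1(num_tries * std::log1p(-p));
}

int main() {
  double p = 1e-4;
  int n = 1000;
  std::printf("%.6f vs %.6f\n",
              appear_at_least_once(p, n),
              1.0 - std::pow(1.0 - p, n));  // both ~0.095163
  return 0;
}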
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/sampler.h" +#include +#include +#include +#include +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +Sampler::~Sampler() {} + +UniformSampler::UniformSampler(int64_t range, unsigned int seed) + : Sampler(range, seed), inv_range_(1.0 / (range + 1)) { + random_engine_ = std::make_shared(seed_); + dist_ = std::make_shared>(0, range); +} + +int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); } + +float UniformSampler::Probability(int64_t value) const { return inv_range_; } + +LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed) + : Sampler(range, seed), log_range_(log(range + 1)) { + random_engine_ = std::make_shared(seed_); + dist_ = std::make_shared>(0, 1); +} + +int64_t LogUniformSampler::Sample() const { + // Got Log Uniform distribution from uniform distribution by + // inverse_transform_sampling method + // More details: + // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/ + const int64_t value = + static_cast(exp((*dist_)(*random_engine_) * log_range_)) - 1; + // Mathematically, value should be <= range_, but might not be due to some + // floating point roundoff, so we mod by range_. + return value % range_; +} + +float LogUniformSampler::Probability(int64_t value) const { + // Given f(x) = 1/[(x+1) * log_range_] + // The value's probability is integral of f(x) from value to (value + 1) + // More details: + // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler + return (log((value + 2.0) / (value + 1.0))) / log_range_; +} + +CustomSampler::CustomSampler(int64_t range, + const float *probabilities, + const int *alias, + const float *alias_probabilities, + unsigned int seed) + : Sampler(range, seed) { + random_engine_ = std::make_shared(seed_); + real_dist_ = std::make_shared>(0, 1); + int_dist_ = std::make_shared>(0, range); + + alias_probs_ = alias_probabilities; + probs_ = probabilities; + alias_ = alias; +} + +int64_t CustomSampler::Sample() const { + auto index = (*int_dist_)(*random_engine_); + auto p = (*real_dist_)(*random_engine_); + if (p > alias_probs_[index]) { + int alias = alias_[index]; + + if (alias == exceptional_val) { + LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; + return index; + } + + return alias; + } else { + return index; + } +} + +float CustomSampler::Probability(int64_t value) const { return probs_[value]; } + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sampler.h b/lite/backends/loongarch/math/sampler.h new file mode 100644 index 00000000000..a18755fc904 --- /dev/null +++ b/lite/backends/loongarch/math/sampler.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
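LogUniformSampler above draws from P(x) = log(1 + 1/(x + 1)) / log(range + 1) by inverse-transform sampling, and its Probability() is the integral of 1/((x + 1) * log_range) over [x, x + 1]. A standalone sketch of both formulas, using std::mt19937_64 directly rather than the class:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <random>

int main() {
  const int64_t range = 1000;
  const double log_range = std::log(range + 1.0);
  std::mt19937_64 rng(42);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);

  // Inverse-transform sample; fold floating-point overshoot back into range.
  int64_t v = static_cast<int64_t>(std::exp(uniform(rng) * log_range)) - 1;
  v %= range;
  double p = std::log((v + 2.0) / (v + 1.0)) / log_range;
  std::printf("sample=%lld p=%.6f\n", static_cast<long long>(v), p);

  // The masses over [0, range) telescope to exactly 1.
  double total = 0.0;
  for (int64_t x = 0; x < range; ++x)
    total += std::log((x + 2.0) / (x + 1.0)) / log_range;
  std::printf("sum=%.6f\n", total);  // ~1.000000
  return 0;
}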
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +// TODO(wanghaoshuang): Support for GPU + +/** +* Sample integers from [0, range). +*/ +class Sampler { + public: + explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { + // CHECK_GT(range, 0, "Range should be greater than 0."); + if (seed == 0) { + std::random_device r; + seed_ = r(); + } else { + seed_ = seed; + } + } + + virtual ~Sampler(); + + // Sample a single value + virtual int64_t Sample() const = 0; + + // The probability that a single call to Sample() returns the given value. + virtual float Probability(int64_t value) const = 0; + + int64_t range() { return range_; } + + protected: + const int64_t range_; + unsigned int seed_; +}; + +/** + * Sample integers from [0, range). + * And the distribution function is: + * P(x) = 1 / range + */ +class UniformSampler : public Sampler { + public: + explicit UniformSampler(int64_t range, unsigned int seed = 0UL); + + ~UniformSampler() override {} + + int64_t Sample() const override; + + float Probability(int64_t value) const override; + + private: + const float inv_range_; + std::shared_ptr random_engine_; + std::shared_ptr> dist_; +}; + +/** + * Sample integers from [0, range). + * And the distribution function is: + * P(x) = (1/ln(range+1)) * ln(1 + 1/(x + 1)) + */ +class LogUniformSampler : public Sampler { + public: + explicit LogUniformSampler(int64_t range, unsigned int seed = 0UL); + + ~LogUniformSampler() override {} + + int64_t Sample() const override; + + float Probability(int64_t value) const override; + + private: + const float log_range_; + std::shared_ptr random_engine_; + std::shared_ptr> dist_; +}; + +/** + * Sample integers from [0, range) from custom distribution. + */ +class CustomSampler : public Sampler { + public: + explicit CustomSampler(int64_t range, + const float* probabilities, + const int* alias, + const float* alias_probabilities, + unsigned int seed = 0UL); + + ~CustomSampler() override {} + + int64_t Sample() const override; + + float Probability(int64_t value) const override; + + private: + const float* alias_probs_; + const int* alias_; + const float* probs_; + const int exceptional_val = -1; + std::shared_ptr random_engine_; + std::shared_ptr> real_dist_; + std::shared_ptr> int_dist_; +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/saturate.h b/lite/backends/loongarch/math/saturate.h new file mode 100644 index 00000000000..d2c511a1139 --- /dev/null +++ b/lite/backends/loongarch/math/saturate.h @@ -0,0 +1,320 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
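CustomSampler above is Walker's alias method: each bucket stores an acceptance probability and an alias bucket, so one uniform integer plus one uniform real yield a draw from an arbitrary discrete distribution in O(1). The class consumes precomputed tables; the standalone sketch below (AliasTable is illustrative, not part of the backend) also shows how such tables can be built:

#include <cstdio>
#include <random>
#include <vector>

// Minimal alias-table construction (Vose's method) and O(1) sampling,
// mirroring the accept/alias decision taken in CustomSampler::Sample.
struct AliasTable {
  std::vector<double> accept;  // probability of keeping the drawn bucket
  std::vector<int> alias;      // bucket returned otherwise

  explicit AliasTable(const std::vector<double>& probs) {
    const int n = static_cast<int>(probs.size());
    accept.assign(n, 0.0);
    alias.assign(n, -1);
    std::vector<double> scaled(n);
    std::vector<int> small, large;
    for (int i = 0; i < n; ++i) {
      scaled[i] = probs[i] * n;
      (scaled[i] < 1.0 ? small : large).push_back(i);
    }
    while (!small.empty() && !large.empty()) {
      int s = small.back(); small.pop_back();
      int l = large.back(); large.pop_back();
      accept[s] = scaled[s];
      alias[s] = l;
      scaled[l] -= 1.0 - scaled[s];
      (scaled[l] < 1.0 ? small : large).push_back(l);
    }
    for (int i : small) accept[i] = 1.0;
    for (int i : large) accept[i] = 1.0;
  }

  template <class RNG>
  int sample(RNG& rng) const {
    std::uniform_int_distribution<int> pick(0, static_cast<int>(accept.size()) - 1);
    std::uniform_real_distribution<double> real(0.0, 1.0);
    int idx = pick(rng);
    return real(rng) <= accept[idx] ? idx : alias[idx];
  }
};

int main() {
  std::mt19937_64 rng(7);
  AliasTable table({0.1, 0.2, 0.3, 0.4});
  int counts[4] = {0, 0, 0, 0};
  for (int i = 0; i < 100000; ++i) ++counts[table.sample(rng)];
  for (int c : counts) std::printf("%d ", c);  // roughly 10k 20k 30k 40k
  std::printf("\n");
  return 0;
}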
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +static inline _Tp saturate_cast(uint8_t v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(int8_t v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(uint16_t v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(int16_t v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(uint32_t v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(int32_t v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(float v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(double v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(int64_t v) { + return _Tp(v); +} +/** @overload */ +template +static inline _Tp saturate_cast(uint64_t v) { + return _Tp(v); +} + +template <> +inline uint8_t saturate_cast(int8_t v) { + return static_cast(std::max(static_cast(v), 0)); +} + +template <> +inline uint8_t saturate_cast(uint16_t v) { + return static_cast(std::min((unsigned)v, (unsigned)UCHAR_MAX)); +} + +template <> +inline uint8_t saturate_cast(int v) { + return static_cast( + ((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0)); +} + +template <> +inline uint8_t saturate_cast(int16_t v) { + return saturate_cast(static_cast(v)); +} + +template <> +inline uint8_t saturate_cast(unsigned v) { + return static_cast(std::min(v, (unsigned)UCHAR_MAX)); +} +template <> +inline uint8_t saturate_cast(float v) { + int iv = static_cast(roundf(v)); + return saturate_cast(iv); +} +template <> +inline uint8_t saturate_cast(double v) { + int iv = static_cast(round(v)); + return saturate_cast(iv); +} +template <> +inline uint8_t saturate_cast(int64_t v) { + return static_cast( + ((uint64_t)v <= (uint64_t)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0)); +} +template <> +inline uint8_t saturate_cast(uint64_t v) { + return static_cast(std::min(v, (uint64_t)UCHAR_MAX)); +} + +template <> +inline int8_t saturate_cast(uint8_t v) { + return static_cast(std::min(static_cast(v), SCHAR_MAX)); +} +template <> +inline int8_t saturate_cast(uint16_t v) { + return static_cast(std::min((unsigned)v, (unsigned)SCHAR_MAX)); +} +template <> +inline int8_t saturate_cast(int v) { + return static_cast(((unsigned)(v - SCHAR_MIN) <= (unsigned)UCHAR_MAX + ? v + : v > 0 ? 
SCHAR_MAX : SCHAR_MIN)); +} +template <> +inline int8_t saturate_cast(int16_t v) { + return saturate_cast(static_cast(v)); +} +template <> +inline int8_t saturate_cast(unsigned v) { + return static_cast(std::min(v, (unsigned)SCHAR_MAX)); +} +template <> +inline int8_t saturate_cast(float v) { + int iv = static_cast(roundf(v)); + return saturate_cast(iv); +} +template <> +inline int8_t saturate_cast(double v) { + int iv = static_cast(round(v)); + return saturate_cast(iv); +} +template <> +inline int8_t saturate_cast(int64_t v) { + return static_cast( + ((uint64_t)(static_cast(v) - SCHAR_MIN) <= (uint64_t)UCHAR_MAX + ? v + : v > 0 ? SCHAR_MAX : SCHAR_MIN)); +} +template <> +inline int8_t saturate_cast(uint64_t v) { + return static_cast(std::min(v, (uint64_t)SCHAR_MAX)); +} + +template <> +inline uint16_t saturate_cast(int8_t v) { + return static_cast(std::max(static_cast(v), 0)); +} + +template <> +inline uint16_t saturate_cast(int16_t v) { + return static_cast(std::max(static_cast(v), 0)); +} +template <> +inline uint16_t saturate_cast(int v) { + return static_cast( + (unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); +} +template <> +inline uint16_t saturate_cast(unsigned v) { + return static_cast(std::min(v, (unsigned)USHRT_MAX)); +} +template <> +inline uint16_t saturate_cast(float v) { + int iv = static_cast(roundf(v)); + return saturate_cast(iv); +} +template <> +inline uint16_t saturate_cast(double v) { + int iv = static_cast(round(v)); + return saturate_cast(iv); +} +template <> +inline uint16_t saturate_cast(int64_t v) { + return static_cast( + (uint64_t)v <= (uint64_t)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); +} +template <> +inline uint16_t saturate_cast(uint64_t v) { + return static_cast(std::min(v, (uint64_t)USHRT_MAX)); +} + +template <> +inline int16_t saturate_cast(uint16_t v) { + return static_cast(std::min(static_cast(v), SHRT_MAX)); +} +template <> +inline int16_t saturate_cast(int v) { + return static_cast((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX + ? v + : v > 0 ? SHRT_MAX : SHRT_MIN); +} +template <> +inline int16_t saturate_cast(unsigned v) { + return (int16_t)std::min(v, (unsigned)SHRT_MAX); +} +template <> +inline int16_t saturate_cast(float v) { + int iv = static_cast(roundf(v)); + return saturate_cast(iv); +} +template <> +inline int16_t saturate_cast(double v) { + int iv = static_cast(round(v)); + return saturate_cast(iv); +} +template <> +inline int16_t saturate_cast(int64_t v) { + return static_cast((uint64_t)((int64_t)v - SHRT_MIN) <= + (uint64_t)USHRT_MAX + ? v + : v > 0 ? SHRT_MAX : SHRT_MIN); +} +template <> +inline int16_t saturate_cast(uint64_t v) { + return static_cast(std::min(v, (uint64_t)SHRT_MAX)); +} + +template <> +inline int saturate_cast(unsigned v) { + return static_cast(std::min(v, (unsigned)INT_MAX)); +} +template <> +inline int saturate_cast(int64_t v) { + return static_cast((uint64_t)(v - INT_MIN) <= (uint64_t)UINT_MAX + ? v + : v > 0 ? 
INT_MAX : INT_MIN); +} +template <> +inline int saturate_cast(uint64_t v) { + return static_cast(std::min(v, (uint64_t)INT_MAX)); +} +template <> +inline int saturate_cast(float v) { + return static_cast(roundf(v)); +} +template <> +inline int saturate_cast(double v) { + return static_cast(round(v)); +} + +template <> +inline unsigned saturate_cast(int8_t v) { + return static_cast(std::max(v, static_cast(0))); +} +template <> +inline unsigned saturate_cast(int16_t v) { + return static_cast(std::max(v, (int16_t)0)); +} +template <> +inline unsigned saturate_cast(int v) { + return static_cast(std::max(v, static_cast(0))); +} +template <> +inline unsigned saturate_cast(int64_t v) { + return static_cast( + (uint64_t)v <= (uint64_t)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); +} +template <> +inline unsigned saturate_cast(uint64_t v) { + return static_cast(std::min(v, (uint64_t)UINT_MAX)); +} +// we intentionally do not clip negative numbers, to make -1 become 0xffffffff +// etc. +template <> +inline unsigned saturate_cast(float v) { + return static_cast(roundf(v)); +} +template <> +inline unsigned saturate_cast(double v) { + return static_cast(round(v)); +} + +template <> +inline uint64_t saturate_cast(int8_t v) { + return static_cast(std::max(v, static_cast(0))); +} + +template <> +inline uint64_t saturate_cast(int16_t v) { + return static_cast(std::max(v, (int16_t)0)); +} +template <> +inline uint64_t saturate_cast(int v) { + return static_cast(std::max(v, static_cast(0))); +} +template <> +inline uint64_t saturate_cast(int64_t v) { + return static_cast(std::max(v, (int64_t)0)); +} + +template <> +inline int64_t saturate_cast(uint64_t v) { + return static_cast(std::min(v, (uint64_t)LLONG_MAX)); +} + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/search_fc.cc b/lite/backends/loongarch/math/search_fc.cc new file mode 100644 index 00000000000..c9b803ab02b --- /dev/null +++ b/lite/backends/loongarch/math/search_fc.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/search_fc.h" +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
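Recapping the saturate_cast overloads added above: out-of-range values clamp to the destination type's limits instead of wrapping, and float/double inputs are rounded before clamping (except for the deliberately unclipped negative-to-unsigned case). A few standalone checks of the uint8_t behaviour (saturate_u8 is a behavioural stand-in, not the template itself):

#include <cassert>
#include <climits>
#include <cmath>
#include <cstdint>

// Behavioural sketch of saturate_cast<uint8_t>(int): clamp to [0, UCHAR_MAX].
static uint8_t saturate_u8(int v) {
  return static_cast<uint8_t>(v < 0 ? 0 : (v > UCHAR_MAX ? UCHAR_MAX : v));
}

int main() {
  assert(saturate_u8(300) == 255);  // clamps high
  assert(saturate_u8(-7) == 0);     // clamps low
  assert(saturate_u8(42) == 42);    // passes through
  // Float input rounds first: 254.6f -> 255 -> 255.
  assert(saturate_u8(static_cast<int>(std::roundf(254.6f))) == 255);
  return 0;
}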
+ */ +template +class SearchFcFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& bottom, + const lite::Tensor& w, + const lite::Tensor& b, + lite::Tensor* top, + int out_size) { + int batch = bottom.dims()[0]; + + int _out = w.dims()[0]; // 100 + int _in = w.dims()[1]; // 228 + + lite::DDim dims(std::vector({bottom.dims()[0], out_size})); + + const auto bottom_data = bottom.data(); + auto top_data = top->template mutable_data(lite::TargetType::kLoongArch); + const auto weights = w.data(); + auto blas = math::GetBlas(context); + call_gemm(blas, + CblasNoTrans, + CblasTrans, + batch, + _out, + _in, + 1.0f, + bottom_data, + weights, + 0.0f, + top_data); + const auto* bias_data = b.data(); + for (int i = 0; i < batch; ++i) { + // add bias here + vector_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out); + } + } + + // private: +}; + +#define DEFINE_FUNCTOR(type) \ + template class SearchFcFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/search_fc.h b/lite/backends/loongarch/math/search_fc.h new file mode 100644 index 00000000000..73a978af240 --- /dev/null +++ b/lite/backends/loongarch/math/search_fc.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/loongarch/fluid/data_type.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +void call_gemm(const BlasT blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +static const unsigned int LASX_STEP_SIZE = 8; +static const unsigned int LSX_STEP_SIZE = 4; +static const unsigned int LASX_CUT_LEN_MASK = 7U; +static const unsigned int LSX_CUT_LEN_MASK = 3U; + +template +inline void vector_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(__loongarch_asx) + lll = len & ~LASX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += LASX_STEP_SIZE) { + lasx_storeu_f32( + z + jjj, + lasx_add_f32(lasx_loadu_f32(x + jjj), lasx_loadu_f32(y + jjj))); + } +#elif defined(__loongarch_sx) + lll = len & ~LSX_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += LSX_STEP_SIZE) { + lsx_storeu_f32(z + jjj, + lsx_add_f32(lsx_loadu_f32(x + jjj), lsx_loadu_f32(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + +template +class SearchFcFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& X, + const lite::Tensor& W, + const lite::Tensor& b, + lite::Tensor* Out, + int out_size); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/backends/loongarch/math/selected_rows_functor.cc b/lite/backends/loongarch/math/selected_rows_functor.cc new file mode 100644 index 00000000000..34d9e817d83 --- /dev/null +++ b/lite/backends/loongarch/math/selected_rows_functor.cc @@ -0,0 +1,436 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
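vector_eltadd above rounds the trip count down to a multiple of the vector width with a mask (len & ~LASX_CUT_LEN_MASK for 8-wide LASX, len & ~LSX_CUT_LEN_MASK for 4-wide LSX) and finishes the tail scalarly; SearchFcFunctor uses it to add the bias to each output row after the GEMM. A scalar mirror of that pattern (eltadd_ref is illustrative only):

#include <cstddef>

// Process the largest multiple of the vector width (8 floats for LASX) in
// blocks, then a scalar tail, mirroring vector_eltadd.
static void eltadd_ref(const float* x, const float* y, float* z, size_t len) {
  const size_t step = 8;                      // LASX_STEP_SIZE
  const size_t main_len = len & ~(step - 1);  // len & ~LASX_CUT_LEN_MASK
  size_t j = 0;
  for (; j < main_len; j += step) {
    for (size_t k = 0; k < step; ++k) z[j + k] = x[j + k] + y[j + k];
  }
  for (; j < len; ++j) z[j] = x[j] + y[j];
}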
*/ + +#include +#include +#include + +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/selected_rows_functor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct SelectedRowsAdd { + void operator()(const lite::LoongArchContext& context, + const fluid::SelectedRows& input1, + const fluid::SelectedRows& input2, + fluid::SelectedRows* output) { + auto in1_height = input1.height(); + CHECK_EQ(in1_height, input2.height()); + output->set_height(in1_height); + + auto& in1_rows = input1.rows(); + auto& in2_rows = input2.rows(); + std::vector out_rows; + out_rows.reserve(in1_rows.size() + in2_rows.size()); + + // concat rows + out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end()); + out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end()); + output->set_rows(out_rows); + + auto* out_value = output->mutable_value(); + auto& in1_value = input1.value(); + auto& in2_value = input2.value(); + + auto in1_row_numel = in1_value.numel() / in1_rows.size(); + CHECK_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + CHECK_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + + auto* out_data = out_value->template mutable_data(); + auto* in1_data = in1_value.data(); + std::copy_n(in1_data, in1_value.numel(), out_data); + + auto* in2_data = in2_value.data(); + std::copy_n(in2_data, in2_value.numel(), out_data + in1_value.numel()); + } +}; + +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; + +template +struct SelectedRowsAddTensor { + void operator()(const lite::LoongArchContext& context, + const fluid::SelectedRows& input1, + const lite::Tensor& input2, + lite::Tensor* output) { + auto in1_height = input1.height(); + auto in2_dims = input2.dims(); + auto out_dims = output->dims(); + CHECK_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, out_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + CHECK_EQ(in1_row_numel, input2.numel() / in1_height); + CHECK_EQ(in1_row_numel, output->numel() / in1_height); + + SetConstant functor; + functor(context, output, 0.0); + + auto* in1_data = in1_value.data(); + auto* out_data = output->template mutable_data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + out_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + + auto out_eigen = fluid::EigenVector::Flatten(*output); + auto in2_eigen = fluid::EigenVector::Flatten(input2); + out_eigen.device(lite::fluid::EigenDeviceType()) = + out_eigen + in2_eigen; + } +}; + +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; + +template +struct SelectedRowsAddTo { + void operator()(const lite::LoongArchContext& context, + const fluid::SelectedRows& input1, + const int64_t input2_offset, + fluid::SelectedRows* input2) { + auto in1_height = input1.height(); + CHECK_EQ(in1_height, input2->height()); + + auto& in1_rows = input1.rows(); + auto& in2_rows = *(input2->mutable_rows()); + + auto& in1_value = input1.value(); + auto* in2_value = input2->mutable_value(); + + // concat rows + in2_rows.reserve(in2_rows.size() + + size_t(in1_rows.end() - in1_rows.begin())); + in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); + + auto* in1_data = in1_value.data(); + auto* in2_data = in2_value->template mutable_data(); + std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); + } 
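SelectedRowsAdd above concatenates the row indices of both inputs and copies the two value blocks back to back, so duplicate row ids can remain in the output; deduplication is left to scatter::MergeAdd further down. A conceptual sketch with a simplified stand-in for fluid::SelectedRows (SelectedRowsLite and add_ref are illustrative only):

#include <cstdint>
#include <vector>

// Simplified stand-in: a sparse block of rows.size() rows stored densely.
struct SelectedRowsLite {
  int64_t height = 0;         // logical row count of the full tensor
  std::vector<int64_t> rows;  // which rows are present
  std::vector<float> value;   // rows.size() * row_numel values
};

// Mirrors SelectedRowsAdd: concatenate rows and values, no deduplication.
static SelectedRowsLite add_ref(const SelectedRowsLite& a,
                                const SelectedRowsLite& b) {
  SelectedRowsLite out;
  out.height = a.height;  // both inputs must share the same height
  out.rows = a.rows;
  out.rows.insert(out.rows.end(), b.rows.begin(), b.rows.end());
  out.value = a.value;
  out.value.insert(out.value.end(), b.value.begin(), b.value.end());
  return out;
}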
+}; + +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; + +template +struct SelectedRowsSumTo { + void operator()(const lite::LoongArchContext& context, + const std::vector& input1, + const std::vector& input2_offsets, + fluid::SelectedRows* input2) { + // Ensure all selected rows have the same height + size_t size = 0u; + for (auto iter = input1.begin(); iter != input1.end(); ++iter) { + auto& in_rows = (*iter)->rows(); + size += in_rows.end() - in_rows.begin(); + auto in1_height = (*iter)->height(); + CHECK_EQ(in1_height, input2->height()); + } + // concat rows + std::vector in2_rows; + in2_rows.reserve(in2_rows.size() + size); + for (auto iter = input1.begin(); iter != input1.end(); ++iter) { + const std::vector& in_rows = (*iter)->rows(); + in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end()); + } + input2->set_rows(in2_rows); + + auto* in2_value = input2->mutable_value(); + T* in2_data = in2_value->template mutable_data(); + auto blas = math::GetBlas(context); + size_t offset = 0u; + for (size_t i = 0u; i != input1.size(); ++i) { + auto& in_value = input1[i]->value(); + const T* in_data = in_value.data(); + offset += input2_offsets[i]; + blas.VCOPY(in_value.numel(), in_data, in2_data + offset); + } + } +}; + +template struct SelectedRowsSumTo; +template struct SelectedRowsSumTo; + +template +struct SelectedRowsAddToTensor { + void operator()(const lite::LoongArchContext& context, + const fluid::SelectedRows& input1, + lite::Tensor* input2) { + CHECK(input1.rows().size() != 0) << "input selected rows is empty!"; + + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + CHECK_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->template mutable_data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + } +}; + +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +// This is a separated namespace for manipulate SelectedRows typed +// data. Like merge duplicated rows, adding two SelectedRows etc. +// +// Another group of functors is called "scatter updates", which means +// use SelectedRows to update a dense tensor with different Ops, like +// add or mul. 
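The MergeAdd functor defined just below is the deduplicating counterpart: rows that appear more than once across the inputs are summed into a single output row (via AXPY for floating-point types). A self-contained sketch of that merge semantics (merge_add_ref is illustrative, keyed by row id):

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// Merge duplicated rows by accumulation, mirroring scatter::MergeAdd:
// e.g. {(3, a), (1, b), (3, c)} -> {1: b, 3: a + c}.
static std::map<int64_t, std::vector<float>> merge_add_ref(
    const std::vector<std::pair<int64_t, std::vector<float>>>& rows) {
  std::map<int64_t, std::vector<float>> merged;
  for (const auto& r : rows) {
    auto it = merged.find(r.first);
    if (it == merged.end()) {
      merged[r.first] = r.second;      // first occurrence: copy
    } else {
      for (size_t j = 0; j < r.second.size(); ++j) {
        it->second[j] += r.second[j];  // duplicates: accumulate
      }
    }
  }
  return merged;
}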
+namespace scatter { + +template +typename std::enable_if< + std::is_floating_point::value && + std::is_same::value>::type +elementwise_add_to(const DeviceContext& ctx, + BlasT* blas, + size_t data_len, + const T* in, + T* out) { + blas->AXPY(data_len, 1., in, out); +} + +template +typename std::enable_if< + !std::is_floating_point::value && + std::is_same::value>::type +elementwise_add_to(const DeviceContext& ctx, + BlasT* blas, + size_t data_len, + const T* in, + T* out) { + for (size_t i = 0; i < data_len; i++) { + out[i] += in[i]; + } +} + +template +struct MergeAdd { + fluid::SelectedRows operator()(const lite::LoongArchContext& context, + const fluid::SelectedRows& input, + const bool sorted_result = false) { + fluid::SelectedRows out; + (*this)(context, input, &out, sorted_result); + return out; + } + + void operator()(const lite::LoongArchContext& context, + const fluid::SelectedRows& input, + fluid::SelectedRows* output, + const bool sorted_result = false) { + std::vector inputs; + inputs.push_back(&input); + (*this)(context, inputs, output, sorted_result); + } + + void operator()(const lite::LoongArchContext& context, + const std::vector& inputs, + fluid::SelectedRows* output, + const bool sorted_result = false) { + if (inputs.size() == 0) { + VLOG(3) << "no input! return"; + return; + } + const fluid::SelectedRows* has_value_input = nullptr; + for (auto* in : inputs) { + if (in->rows().size() > 0) { + has_value_input = in; + break; + } + } + if (has_value_input == nullptr) { + VLOG(3) << "no input has value! just return"; + return; + } + auto input_width = has_value_input->value().dims()[1]; + auto input_height = has_value_input->height(); + fluid::SelectedRows& out = *output; + std::set merged_row_set; + size_t row_num = 0; + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + CHECK_EQ(input_width, input->value().dims()[1]) + << "all input should have same " + "dimension except for the first one"; + CHECK_EQ(input_height, input->height()) + << "all input should have same height"; + row_num += input->rows().size(); + merged_row_set.insert(input->rows().begin(), input->rows().end()); + } + + out.set_height(input_height); + lite::DDim dims(std::vector( + {static_cast(merged_row_set.size()), input_width})); + out.mutable_value()->Resize(dims); + auto* out_data = out.mutable_value()->template mutable_data(); + + if (merged_row_set.size() == row_num && !sorted_result) { + // no duplicated ids, just concat the result together + std::vector merge_rows; + merge_rows.reserve(row_num); + // concat rows + for (auto* in : inputs) { + merge_rows.insert( + merge_rows.end(), in->rows().begin(), in->rows().end()); + } + out.set_rows(merge_rows); + int64_t copied_numel = 0; + for (auto* in : inputs) { + auto* in_data = in->value().data(); + auto in_numel = in->value().numel(); + std::copy_n(in_data, in_numel, out_data + copied_numel); + copied_numel += in_numel; + } + } else { + std::vector merge_rows(merged_row_set.begin(), + merged_row_set.end()); + + if (sorted_result) { + std::stable_sort(merge_rows.begin(), merge_rows.end()); + } + + out.set_rows(merge_rows); + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + std::map rows_to_id; + for (size_t i = 0; i < merge_rows.size(); ++i) { + rows_to_id[merge_rows[i]] = i; + } + + auto blas = math::GetBlas(context); + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + auto* input_data = input->value().data(); + auto& input_rows = input->rows(); + 
+ for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_to_id[input_rows[i]]; + elementwise_add_to( + context, + &blas, + static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); + } + } + } + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +struct UpdateToTensor { + void operator()(const lite::LoongArchContext& context, + const ScatterOps& op, + const fluid::SelectedRows& input1, + lite::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + CHECK_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->template data(); + + // FIXME(typhoonzero): use macro fix the below messy code. + switch (op) { + case ScatterOps::ASSIGN: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::ADD: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUB: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] -= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUBBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] - + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + case ScatterOps::MUL: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] *= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIV: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] /= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIVBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] / + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + } + } +}; + +} // namespace scatter +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/selected_rows_functor.h b/lite/backends/loongarch/math/selected_rows_functor.h new file mode 100644 index 00000000000..5abfa9c8154 --- /dev/null +++ b/lite/backends/loongarch/math/selected_rows_functor.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include + +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/fluid/selected_rows.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/core/context.h" + +#define INLINE_FOR2(sizei, sizej) \ + for (int64_t i = 0; i < sizei; i++) \ + for (int64_t j = 0; j < sizej; j++) + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +struct SelectedRowsAdd { + void operator()(const lite::Context& context, + const fluid::SelectedRows& input1, + const fluid::SelectedRows& input2, + fluid::SelectedRows* output); +}; + +template +struct SelectedRowsAddTensor { + void operator()(const lite::Context& context, + const fluid::SelectedRows& input1, + const lite::Tensor& input2, + lite::Tensor* output); +}; + +// input2 = input1 + input2 +template +struct SelectedRowsAddTo { + void operator()(const lite::Context& context, + const fluid::SelectedRows& input1, + const int64_t input2_offset, + fluid::SelectedRows* input2); +}; + +// input2 = [all input in input1] + input2 +template +struct SelectedRowsSumTo { + void operator()(const lite::Context& context, + const std::vector& input1, + const std::vector& input2_offsets, + fluid::SelectedRows* input2); +}; + +// FIXME: The result of SelectedRowsAddToTensor maybe non deterministic, +// because it uses CudaAtomicAdd. +// input2 = input1 + input2 +template +struct SelectedRowsAddToTensor { + void operator()(const lite::Context& context, + const fluid::SelectedRows& input1, + lite::Tensor* input2); +}; + +namespace scatter { +// functors for manuplating SelectedRows data +template +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. + fluid::SelectedRows operator()(const lite::Context& context, + const fluid::SelectedRows& input, + const bool sorted_result = false); + void operator()(const lite::Context& context, + const fluid::SelectedRows& input, + fluid::SelectedRows* output, + const bool sorted_result = false); + void operator()(const lite::Context& context, + const std::vector& inputs, + fluid::SelectedRows* output, + const bool sorted_result = false); +}; + +enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; + +// out = selected_rows_in / tensor +template +struct UpdateToTensor { + void operator()(const lite::Context& context, + const ScatterOps& op, + const fluid::SelectedRows& input1, + lite::Tensor* input2); +}; + +} // namespace scatter +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sequence2batch.cc b/lite/backends/loongarch/math/sequence2batch.cc new file mode 100644 index 00000000000..2d27e3c325d --- /dev/null +++ b/lite/backends/loongarch/math/sequence2batch.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/loongarch/math/sequence2batch.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& src, + const std::vector& index_lod, + lite::Tensor* dst, + bool is_src_index) { + const uint64_t* index = index_lod.data(); + const auto& src_dims = src.dims(); + const auto& dst_dims = dst->dims(); + CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2."; + CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2."; + CHECK_EQ(src_dims[1], dst_dims[1]) + << "The width of src and dst must be same."; + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst->template mutable_data(); + const int sz = width * sizeof(T); + if (is_src_index) { + for (int i = 0; i < height; ++i) { + memcpy(dst_data + i * width, src_data + index[i] * width, sz); + } + } else { + for (int i = 0; i < height; ++i) { + memcpy(dst_data + index[i] * width, src_data + i * width, sz); + } + } + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sequence2batch.h b/lite/backends/loongarch/math/sequence2batch.h new file mode 100644 index 00000000000..d6af27f0e21 --- /dev/null +++ b/lite/backends/loongarch/math/sequence2batch.h @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + // If is_src_index is true, + // copy the indexed rows of input src to the output dst. + // If is_src_index is false, + // copy the input src to the indexed rows of output dst. + // The indexed rows are based on the input index. + void operator()(const lite::Context& context, + const lite::Tensor& src, + const std::vector& index_lod, + lite::Tensor* dst, + bool is_src_index); +}; + +template +class LoDTensor2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. 
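+  // Each SeqInfo entry below records (start, length, seq_idx) for one
+  // sequence, where seq_idx is the sequence's original position in the input
+  // LoD; entries are later sorted by length in decreasing order.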
+ // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + // + struct SeqInfo { + SeqInfo() = default; + SeqInfo(int start, int length, int seq_idx) + : start(start), length(length), seq_idx(seq_idx) {} + int start; + int length; + int seq_idx; + }; + + public: + void operator()(const lite::Context& context, + const lite::Tensor& lod_tensor, + lite::Tensor* batch, + bool is_cal_batch_lod, + bool is_reverse = false) const { + if (!is_cal_batch_lod) { + auto lods = batch->lod(); + CHECK_GT(lods.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])) + << "The LoD information should be consistent with the dims."; + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, lods[1], batch, true); + return; + } + + auto lods = lod_tensor.lod(); + CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; + + const auto& lod = lods[0]; + + std::vector seq_info(lod.size() - 1); + for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { + int length = lod[seq_id + 1] - lod[seq_id]; + seq_info[seq_id].start = lod[seq_id]; + seq_info[seq_id].length = length; + seq_info[seq_id].seq_idx = seq_id; + } + + std::stable_sort(seq_info.begin(), + seq_info.end(), + [](SeqInfo a, SeqInfo b) { return a.length > b.length; }); + + // Calculate the start position of each batch. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // max_seqlen = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = len(b0) + // batch_start_positions[1] = len(b0) + len(b1) + // batch_start_positions[2] = len(b0) + len(b1) + len(b2) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // seq_order = {1, 0, 2}, the sort order. + // where 1 is the second sequence, + // 0 is the first sequence, + // 2 is the third sequence. + // The max_seqlen represents batch size after rearranging the + // input LodTensor. It is also the maximum length of input sequence. + + LoD* batch_lods = batch->mutable_lod(); + batch_lods->resize(3); + + // batch_lods[0] is the start positions for batch LoDTensor + int max_seqlen = seq_info[0].length; + batch_lods->at(0).resize(static_cast(max_seqlen + 1)); + // batch_lods[1] is the raw index in the input LoDTensor + batch_lods->at(1).resize(static_cast(lod_tensor.dims()[0])); + // batch_lods[2] is the sort order for the input LoDTensor. + batch_lods->at(2).resize(seq_info.size()); + + auto* batch_starts = batch_lods->at(0).data(); + auto* seq2batch_idx = batch_lods->at(1).data(); + batch_starts[0] = 0; + for (int n = 0; n < max_seqlen; n++) { + auto batch_id = static_cast(batch_starts[n]); + for (size_t i = 0; i < seq_info.size(); ++i) { + int seq_len = seq_info[i].length; + int start = seq_info[i].start; + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? 
start + seq_len - 1 - n : start + n; + batch_id++; + } else { + break; + } + } + batch_starts[n + 1] = static_cast(batch_id); + } + auto* seq_order = batch_lods->at(2).data(); + for (size_t i = 0; i < seq_info.size(); ++i) { + seq_order[i] = seq_info[i].seq_idx; + } + + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, batch_lods->at(1), batch, true); + } +}; + +template +class Batch2LoDTensorFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& batch, + lite::Tensor* lod_tensor) const { + auto in_lod = batch.lod(); + CHECK_GT(in_lod.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) + << "The LoD information should be consistent with the dims."; + CopyMatrixRowsFunctor to_seq; + to_seq(context, batch, in_lod[1], lod_tensor, false); + } +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sequence_pooling.cc b/lite/backends/loongarch/math/sequence_pooling.cc new file mode 100644 index 00000000000..b93521708ae --- /dev/null +++ b/lite/backends/loongarch/math/sequence_pooling.cc @@ -0,0 +1,416 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/jit/kernels.h" +#include "lite/backends/loongarch/legacy_place.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/backends/loongarch/math/sequence_pooling.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +using EigenVector = lite::fluid::EigenVector; +template +using EigenMatrix = lite::fluid::EigenMatrix; + +template +class MaxSeqPoolFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + T pad_value, + lite::Tensor* output, + lite::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto idx_dims = index->dims(); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); + for (size_t i = 1; i < in_dims.size(); ++i) { + CHECK_EQ(in_dims[i], out_dims[i]); + } + CHECK_EQ(idx_dims, out_dims); + + auto starts = input.lod()[input.lod().size() - 1]; + const T* in_data = input.data(); + T* out_data = output->template mutable_data(); + int* max_index = index->mutable_data(); + + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + if (starts[i] == starts[i + 1]) { + for (int64_t k = 0; k < dim; ++k) { + out_data[i * dim + k] = pad_value; + max_index[i * dim + k] = -1; + } + continue; + } + for (int64_t k = 0; k < dim; ++k) { + out_data[i * dim + k] = in_data[starts[i] * dim + k]; + max_index[i * dim + k] = starts[i]; + } + for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { + for (int64_t k = 0; k < dim; ++k) { + if (in_data[j * dim + k] > out_data[i * dim + k]) { + out_data[i * dim + k] = in_data[j * dim + k]; + max_index[i * dim + k] = j; + } + } + } + } + } +}; +// Instantisation of Max Sequence Pooling for test phase eg. 
no need to fill +// index buffer +template +class MaxSeqPoolFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + T pad_value, + lite::Tensor* output, + lite::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto idx_dims = index->dims(); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); + for (size_t i = 1; i < in_dims.size(); ++i) { + CHECK_EQ(in_dims[i], out_dims[i]); + } + for (size_t i = 0; i < idx_dims.size(); ++i) { + CHECK_EQ(idx_dims[i], out_dims[i]); + } + auto starts = input.lod()[input.lod().size() - 1]; + const T* in_data = input.data(); + T* out_data = output->template mutable_data(); + int* max_index = index->template mutable_data(); + + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + if (starts[i] == starts[i + 1]) { + for (int64_t k = 0; k < dim; ++k) { + out_data[i * dim + k] = pad_value; + max_index[i * dim + k] = -1; + } + continue; + } + std::memcpy( + &out_data[i * dim], &in_data[starts[i] * dim], dim * sizeof(T)); + for (int64_t k = 0; k < dim; ++k) { + max_index[i * dim + k] = starts[i]; + } + for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { + for (int64_t k = 0; k < dim; ++k) { + if (in_data[j * dim + k] > out_data[i * dim + k]) { + out_data[i * dim + k] = in_data[j * dim + k]; + max_index[i * dim + k] = j; + } + } + } + } + } +}; +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& out_grad, + const lite::Tensor& index, + lite::Tensor* in_grad) { + auto og_dims = out_grad.dims(); + auto ig_dims = in_grad->dims(); + auto idx_dims = index.dims(); + CHECK_GT(og_dims.size(), 1); + CHECK_GT(ig_dims.size(), 1); + for (size_t i = 1; i < og_dims.size(); ++i) { + CHECK_EQ(og_dims[i], ig_dims[i]); + } + CHECK_EQ(idx_dims, og_dims); + + const T* og_data = out_grad.data(); + const int* max_index = index.data(); + T* ig_data = in_grad->template mutable_data(); + + SetConstant set_zero; + set_zero(context, in_grad, static_cast(0.0)); + int64_t num_seq = og_dims[0]; + int64_t dim = out_grad.numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + for (int64_t j = 0; j < dim; ++j) { + int step_id = max_index[i * dim + j]; + if (step_id == -1) continue; + ig_data[step_id * dim + j] = og_data[i * dim + j]; + } + } + } +}; + +template +class LastSeqPoolFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + T pad_value, + lite::Tensor* output) { + // Create pointers to input and output data + auto* in_data = input.data(); + auto* out_data = output->template mutable_data(); + + // Calculate the size of each item in sequence + int64_t item_size = input.numel() / input.dims()[0]; + auto lod = input.lod()[input.lod().size() - 1]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + // Calculate the length of each sequence + int64_t seq_len = static_cast(lod[i + 1] - lod[i]); + if (seq_len == 0) { + for (int j = 0; j < item_size; ++j) { + out_data[j] = pad_value; + } + } else { + // Point to the begin of next sequence + in_data += seq_len * item_size; + // Copy the last item of sequence to output + std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T)); + } + out_data += item_size; + } + } +}; + +template +class FirstSeqPoolFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + T 
pad_value, + lite::Tensor* output) { + // Create pointers to input and output data + auto* in_data = input.data(); + auto* out_data = output->template mutable_data(); + + // Calculate the size of each item in sequence + int64_t item_size = input.numel() / input.dims()[0]; + auto lod = input.lod()[input.lod().size() - 1]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + // Calculate the length of each sequence + int64_t seq_len = static_cast(lod[i + 1] - lod[i]); + if (seq_len == 0) { + for (int j = 0; j < item_size; ++j) { + out_data[j] = pad_value; + } + } else { + // Copy the first item of sequence to output + std::memcpy(out_data, in_data, item_size * sizeof(T)); + // Point to the next sequence + in_data += seq_len * item_size; + } + out_data += item_size; + } + } +}; + +template +class SumSeqPoolGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& out_grad, + lite::Tensor* in_grad) { + auto lod = in_grad->lod()[0]; + int64_t out_w = out_grad.numel() / out_grad.dims()[0]; + int64_t in_w = in_grad->numel() / in_grad->dims()[0]; + CHECK(in_w == out_w); + const T* out_g_data = out_grad.data(); + T* in_g_data = in_grad->template mutable_data(TARGET(kLoongArch)); + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + if (h == 0) continue; + int64_t in_offset = lod[i] * in_w; + const T* out_pos = out_g_data + i * out_w; + T* in_pos = in_g_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(in_w, out_pos, in_pos + r * in_w); + } + } + } +}; + +template +class SequencePoolFunctor { + public: + /* max pool has index output */ + void operator()(const lite::LoongArchContext& context, + const std::string pooltype, + T pad_value, + const lite::Tensor& input, + lite::Tensor* output, + bool is_test, + lite::Tensor* index = nullptr) { + if (pooltype == "MAX") { + if (is_test) { + math::MaxSeqPoolFunctor max_pool; + max_pool(context, input, pad_value, output, index); + } else { + math::MaxSeqPoolFunctor max_pool; + max_pool(context, input, pad_value, output, index); + } + return; + } + if (pooltype == "LAST") { + math::LastSeqPoolFunctor last_pool; + last_pool(context, input, pad_value, output); + return; + } + if (pooltype == "FIRST") { + math::FirstSeqPoolFunctor first_pool; + first_pool(context, input, pad_value, output); + return; + } + + auto lod = input.lod()[input.lod().size() - 1]; + if (pooltype == "SUM") { + const T* src = input.data(); + T* dst = output->template mutable_data(TARGET(kLoongArch)); + jit::seq_pool_attr_t attr( + static_cast(input.numel() / input.dims()[0]), + jit::SeqPoolType::kSum); + auto seqpool = + jit::KernelFuncs, lite::fluid::CPUPlace>::Cache() + .At(attr); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + attr.h = static_cast(lod[i + 1] - lod[i]); + if (attr.h == 0) { + for (int j = 0; j < attr.w; ++j) { + dst[j] = pad_value; + } + } else { + seqpool(src, dst, &attr); + } + dst += attr.w; + src += attr.h * attr.w; + } + return; + } + auto eigen_device = lite::fluid::EigenDeviceType(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + Tensor out_t = output->Slice(i, i + 1); + int64_t w = input.numel() / input.dims()[0]; + if (lod[i] == lod[i + 1]) { + for (int j = 0; j < w; ++j) { + out_t.mutable_data()[j] = pad_value; + } + continue; + } + Tensor in_t = input.Slice(static_cast(lod[i]), + static_cast(lod[i + 1])); + int64_t h = static_cast(lod[i + 1] - lod[i]); + 
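+      // Only AVERAGE and SQRT reach this point; the [h, w] slice of the
+      // current sequence is reduced along the time dimension via Eigen below.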
auto in_e = EigenMatrix::From(in_t, lite::DDim({h, w})); + auto out_e = EigenVector::Flatten(out_t); + if (pooltype == "AVERAGE") { + out_e.device(eigen_device) = in_e.mean(Eigen::array({{0}})); + } else if (pooltype == "SQRT") { + out_e.device(eigen_device) = in_e.sum(Eigen::array({{0}})) / + std::sqrt(static_cast(h)); + } else { + LOG(FATAL) << "unsupported pooling pooltype"; + } + } + } +}; + +template +class SequencePoolGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const std::string pooltype, + const lite::Tensor& out_grad, + lite::Tensor* in_grad, + /* max pool has index */ + const lite::Tensor* index = nullptr) { + if (pooltype == "MAX") { + math::MaxSeqPoolGradFunctor max_pool_grad; + max_pool_grad(context, out_grad, *index, in_grad); + return; + } + + if (pooltype == "LAST" || pooltype == "FIRST") { + // set X@Grad be zero at first when pooltype is LAST/FIRST + math::SetConstant functor; + functor(context, in_grad, 0); + } + + if (pooltype == "SUM") { + math::SumSeqPoolGradFunctor sum_pool_grad; + sum_pool_grad(context, out_grad, in_grad); + return; + } + + auto lod = in_grad->lod()[0]; + + auto eigen_device = lite::fluid::EigenDeviceType(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + if (lod[i] == lod[i + 1]) continue; + auto in_g_t = in_grad->Slice(static_cast(lod[i]), + static_cast(lod[i + 1])); + auto out_g_t = out_grad.Slice(i, i + 1); + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t w = in_grad->numel() / in_grad->dims()[0]; + auto in_g_e = EigenMatrix::From(in_g_t, DDim({h, w})); + auto out_g_e = EigenMatrix::From(out_g_t, DDim({1, w})); + auto out_g_e_v = EigenVector::Flatten(out_g_t); + Eigen::DSizes bcast(h, 1); + + if (pooltype == "AVERAGE") { + in_g_e.device(eigen_device) = + (out_g_e / static_cast(h)).broadcast(bcast); + } else if (pooltype == "SQRT") { + in_g_e.device(eigen_device) = + (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); + } else if (pooltype == "LAST") { + in_g_e.chip(h - 1, 0).device(eigen_device) = out_g_e_v; + } else if (pooltype == "FIRST") { + in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v; + } else { + LOG(FATAL) << "unsupported pooling pooltype"; + } + } + } +}; + +template class SequencePoolFunctor; +// Note: these implementations have not been called yet +// Template class SequencePoolFunctor; +// Template class SequencePoolGradFunctor; +// Template class SequencePoolGradFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sequence_pooling.h b/lite/backends/loongarch/math/sequence_pooling.h new file mode 100644 index 00000000000..c9f9913e7b6 --- /dev/null +++ b/lite/backends/loongarch/math/sequence_pooling.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +class SequencePoolFunctor { + public: + /* max pool has index output */ + void operator()(const lite::Context& context, + const std::string pooltype, + T pad_value, + const lite::Tensor& input, + lite::Tensor* output, + bool is_test = false, + lite::Tensor* index = nullptr); +}; + +template +class SequencePoolGradFunctor { + public: + void operator()(const lite::Context& context, + const std::string pooltype, + const lite::Tensor& out_grad, + lite::Tensor* in_grad, + /* max pool has index */ + const lite::Tensor* index = nullptr); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sequence_scale.cc b/lite/backends/loongarch/math/sequence_scale.cc new file mode 100644 index 00000000000..dda5d4fa065 --- /dev/null +++ b/lite/backends/loongarch/math/sequence_scale.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/sequence_scale.h" +#include "lite/backends/loongarch/fluid/lod.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const lite::Context& context, + const T* scales, + lite::Tensor* seq) { + const size_t level = 0; + auto lod = seq->lod(); + const size_t num_seq = lod[level].size() - 1; + size_t seq_width = seq->dims()[1]; + lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); + + T* seq_data = seq->template mutable_data(lite::TargetType::kLoongArch); + for (size_t i = 0; i < num_seq; ++i) { + for (size_t j = lod[level][i] * seq_width; + j < lod[level][i + 1] * seq_width; + ++j) { + seq_data[j] *= scales[i]; + } + } + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sequence_scale.h b/lite/backends/loongarch/math/sequence_scale.h new file mode 100644 index 00000000000..2fed28246c4 --- /dev/null +++ b/lite/backends/loongarch/math/sequence_scale.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * \brief Scale a sequence. + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * Given: + * seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * scales = (2, 3, 4, 5) + * then: + * result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3) + + * + * \param context Device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param scales Array. The i-th sequence will be scaled by scales[i]. + * \param num_seq Number of sequence + * + */ + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const lite::Context& context, + const T* scales, + lite::Tensor* seq); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sequence_topk_avg_pooling.cc b/lite/backends/loongarch/math/sequence_topk_avg_pooling.cc new file mode 100644 index 00000000000..c1c05515404 --- /dev/null +++ b/lite/backends/loongarch/math/sequence_topk_avg_pooling.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/sequence_topk_avg_pooling.h" +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +void get_topk_pos(const T* data, int length, int k, int* pos, bool debug) { + size_t real_k = k < length ? k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
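+ *
+ * For every (row, channel) pair this functor selects the top-k values along
+ * the columns, writes their positions to pos, and writes the averages of the
+ * top-k values (one average per k in topks) to out.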
+ */ +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& in, + const lite::Tensor& row, + const lite::Tensor& col, + lite::Tensor* out, + lite::Tensor* pos, + int channel_num, + std::vector topks) { + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + std::vector vec_pos_shape; + auto in_lod = in.lod()[0]; + auto row_lod = row.lod()[0]; + auto col_lod = col.lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + vec_pos_shape.push_back(pos_total_size); + lite::DDim dims(vec_pos_shape); + pos->Resize(dims); + auto pos_data = pos->mutable_data(lite::TargetType::kLoongArch); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + + lite::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in.data(); + auto out_data = out->template mutable_data(lite::TargetType::kLoongArch); + + T* sum_data = new T[max_k]; + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + + CHECK_EQ(total_size, channel_num * row_size * col_size) + << "size wrong in sequence_topk_avg_pooling_op!"; + + int feature_num = row_size * col_size; + for (int j = 0; j < channel_num; ++j) { + auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num; + + for (int r = 0; r < row_size; ++r) { + auto row_data = input_offset_feature_data + r * col_size; + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto out_slice_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + + get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == -1) { + sum_data[0] = 0.0; + } else { + sum_data[0] = row_data[pos_slice_data[0]]; + } + for (int k = 1; k < max_k; ++k) { + if (pos_slice_data[k] == -1) { + sum_data[k] = sum_data[k - 1]; + } else { + sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; + } + } + for (size_t k = 0; k < k_num; ++k) { + out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; + } + } + } + } + delete[] sum_data; + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class SequenceTopkAvgPoolingFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/sequence_topk_avg_pooling.h b/lite/backends/loongarch/math/sequence_topk_avg_pooling.h new file mode 100644 index 00000000000..3334ba8e982 --- /dev/null +++ b/lite/backends/loongarch/math/sequence_topk_avg_pooling.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/backends/loongarch/fluid/data_type.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +template +void get_topk_pos( + const T* data, int length, int k, int* pos, bool debug = false); + +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& X, + const lite::Tensor& ROW, + const lite::Tensor& COLUMN, + lite::Tensor* Out, + lite::Tensor* pos, + int channel_num, + std::vector topks); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/backends/loongarch/math/softmax.cc b/lite/backends/loongarch/math/softmax.cc new file mode 100644 index 00000000000..f33c16084eb --- /dev/null +++ b/lite/backends/loongarch/math/softmax.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/softmax.h" +#include "lite/backends/loongarch/math/softmax_impl.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template class SoftmaxFunctor; +// note: these implemetaions have not been called yet +// template class SoftmaxFunctor; +// template class SoftmaxFunctor; +// template class SoftmaxFunctor; +// template class SoftmaxGradFunctor; +// template class SoftmaxGradFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/softmax.h b/lite/backends/loongarch/math/softmax.h new file mode 100644 index 00000000000..665dcc312e5 --- /dev/null +++ b/lite/backends/loongarch/math/softmax.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +class SoftmaxFunctor { + public: + void operator()(const lite::Context& context, + const int axis_dim, + const lite::Tensor* X, + lite::Tensor* Y); +}; + +template +class SoftmaxGradFunctor { + public: + void operator()(const lite::Context& context, + const int axis_dim, + const lite::TensorLite* y, + const lite::TensorLite* y_grad, + lite::TensorLite* x_grad); +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/softmax_impl.h b/lite/backends/loongarch/math/softmax_impl.h new file mode 100644 index 00000000000..483f0a3aacf --- /dev/null +++ b/lite/backends/loongarch/math/softmax_impl.h @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/loongarch/cpu_info.h" +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/math/cpu_vec.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +template +using EigenMatrix = lite::fluid::EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = static_cast(-64.); + return x < kThreshold ? 
kThreshold : x; + } +}; + +template +void SoftmaxEigen(const lite::Context& context, + const int axis_dim, + const lite::Tensor* X, + lite::Tensor* Y) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + + softmax.device(typename lite::fluid::EigenDevice::Type()) = + shifted_logits.exp(); + softmax.device(typename lite::fluid::EigenDevice::Type()) = + (softmax * + softmax.reshape(batch_axis_remain) + .sum(along_class) + .inverse() + .eval() + .broadcast(one_axis)); +} + +template +void SoftmaxFunctor::operator()( + const lite::Context& context, + const int axis_dim, + const lite::Tensor* X, + lite::Tensor* Y) { + SoftmaxEigen, T, is_test>(context, axis_dim, X, Y); +} + +template +using enable_if_CPU = typename std::enable_if< + std::is_same, lite::LoongArchContext>::value>::type; + +template +class SoftmaxFunctor> { + public: + void operator()(const lite::Context& context, + const int axis_dim, + const lite::Tensor* X, + lite::Tensor* Y) { + const auto& in_dims = X->dims(); + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int num_classes = in_dims[kClassDim]; + const int batch_size = in_dims[kBatchDim]; + const int num_remain = num_classes / axis_dim; + + if (num_remain == 1 && lite::loongarch::MayIUse(lite::loongarch::lasx)) { + const T* in_data = X->template data(); + auto* out_data = Y->template mutable_data(); + for (int bs = 0; bs < batch_size; ++bs) { + T max_val = *std::max_element(in_data, in_data + num_classes); + max_val *= static_cast(-1); + vec_add_bias( + num_classes, max_val, in_data, out_data); + vec_clip( + num_classes, static_cast(-64), out_data, out_data); + vec_exp(num_classes, out_data, out_data); + + T sum = 0; + vec_sum(num_classes, out_data, &sum); + sum = static_cast(1) / sum; + vec_scal(num_classes, sum, out_data, out_data); + + in_data += num_classes; + out_data += num_classes; + } + } else { + SoftmaxEigen(context, axis_dim, X, Y); + } + } +}; + +template +class SoftmaxFunctor> { + public: + void operator()(const lite::Context& context, + const int axis_dim, + const lite::Tensor* X, + lite::Tensor* Y) { + const auto& in_dims = X->dims(); + const float* in_data = X->data(); + float* out_data = Y->mutable_data(); + const int kBatchDim = 0; + const int kClassDim = 1; + const int batch_size = in_dims[kBatchDim]; + const int length = in_dims[kClassDim]; + const int stride = in_dims[kClassDim] / axis_dim; + for (int bs = 0; bs < batch_size; ++bs) { + // get max value of input data + float in_max = -FLT_MAX; + for (int i = 0; i < length; ++i) { + in_max = (std::max)(in_max, in_data[i]); + } + // y = exp(x - in_max) + for (int i = 0; i < length; ++i) { + out_data[i] = static_cast(std::exp(in_data[i] - in_max)); + } + // y = y / sum(y[i], y[i + stride], y[i + stride + stride] ...) 
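+      // i.e. softmax is normalized within each group of axis_dim elements
+      // spaced stride apart, not across the whole row.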
+ for (int i = 0; i < stride; ++i) { + float sum = 0.f; + for (int j = 0; j < axis_dim; ++j) { + sum += out_data[i + j * stride]; + } + for (int j = 0; j < axis_dim; ++j) { + out_data[i + j * stride] /= sum; + } + } + in_data += length; + out_data += length; + } + } +}; + +template +void SoftmaxGradEigen(const lite::Context& context, + const int axis_dim, + const lite::Tensor* y, + const lite::Tensor* y_grad, + lite::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + auto dot = (softmax * softmax_grad) + .reshape(batch_axis_remain) + .sum(along_class) + .eval() + .broadcast(one_axis); + // logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * + // softmax; + logits_grad.device(typename lite::fluid::EigenDevice::Type()) = + (softmax_grad - dot) * softmax; +} + +template +void SoftmaxGradFunctor::operator()( + const lite::Context& context, + const int axis_dim, + const lite::Tensor* y, + const lite::Tensor* y_grad, + lite::Tensor* x_grad) { + SoftmaxGradEigen, T>( + context, axis_dim, y, y_grad, x_grad); +} + +template +class SoftmaxGradFunctor> { + public: + void operator()(const lite::Context& context, + const int axis_dim, + const lite::Tensor* y, + const lite::Tensor* y_grad, + lite::Tensor* x_grad) { + auto out_dims = y->dims(); + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + const int num_classes = out_dims[kClassDim]; + const int batch_size = out_dims[kBatchDim]; + const int num_remain = num_classes / axis_dim; + + if (num_remain == 1 && lite::loongarch::MayIUse(lite::loongarch::lasx)) { + const T* out_data = y->template data(); + const T* out_grad = y_grad->template data(); + T* in_grad = x_grad->template mutable_data(); + for (int bs = 0; bs < batch_size; ++bs) { + T scalar; + vec_mul_reduce( + num_classes, out_grad, out_data, &scalar); + scalar *= static_cast(-1); + vec_add_bias(num_classes, scalar, out_grad, in_grad); + vec_mul(num_classes, out_data, in_grad, in_grad); + out_data += num_classes; + out_grad += num_classes; + in_grad += num_classes; + } + } else { + SoftmaxGradEigen(context, axis_dim, y, y_grad, x_grad); + } + } +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/tree2col.cc b/lite/backends/loongarch/math/tree2col.cc new file mode 100644 index 00000000000..3ca43c97ac4 --- /dev/null +++ b/lite/backends/loongarch/math/tree2col.cc @@ -0,0 +1,205 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/loongarch/math/tree2col.h" +#include +#include + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +std::vector Tree2ColUtil::construct_patch( + size_t root, int max_depth, const std::vector> &tr) { + std::stack> stack; + std::map visited; + std::vector patch; + + stack.push(TreeNode(root, 1, 1, 0)); + patch.emplace_back(TreeNode(root, 1, 1, 0)); + visited[root] = true; + + while (!stack.empty()) { + TreeNode &u = stack.top(); + bool end = true; + size_t node = u.get_node(), sz = tr[node].size(); + visited[node] = true; + for (size_t i = 0; i < sz; i++) { + size_t v = tr[node][i]; + if (!visited[v] && static_cast(u.get_depth()) + 1 < max_depth) { + visited[v] = true; + stack.push(TreeNode(v, i, sz, u.get_depth() + 1)); + patch.push_back(TreeNode(v, i + 1, sz, u.get_depth() + 1)); + end = false; + } + } + if (end) { + stack.pop(); + } + } + return patch; +} + +void Tree2ColUtil::construct_tree(const lite::Tensor &EdgeSet, + std::vector> *tr, + size_t *node_count) { + auto edge_set_dims = EdgeSet.dims(); + CHECK_EQ(edge_set_dims[1], 2); + int64_t edge_count = EdgeSet.numel(); + + const int *edge_data = EdgeSet.data(); + + for (int64_t i = 0; i < edge_count; i += 2) { + int u = edge_data[i], v = edge_data[i + 1]; + if (u != 0 && v != 0) (*node_count)++; + } + (*node_count)++; + + tr->resize(static_cast(*node_count + 1)); + + for (int64_t i = 0; i < edge_count; i += 2) { + int u = edge_data[i], v = edge_data[i + 1]; + if (u != 0 && v != 0) { + tr->at(u).push_back(v); + } else { + break; + } + } +} + +template +class Tree2ColFunctor { + public: + void operator()(const lite::LoongArchContext &context, + const lite::Tensor &EdgeSet, + const lite::Tensor &node_features, + lite::Tensor *patch, + int max_depth) { + std::vector> tr; + auto feature_dims = node_features.dims(); + math::SetConstant constant; + int64_t feature_size = feature_dims[1]; + size_t patch_elem_size = 3 * static_cast(feature_size); + size_t node_count = 0, patch_count = 0, patch_size; + Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count); + std::vector> processing_list; + for (size_t u = 1; u <= node_count; u++) { + std::vector temp_patch = + Tree2ColUtil::construct_patch(u, max_depth, tr); + if (!temp_patch.empty()) { + processing_list.emplace_back(temp_patch); + } + } + patch_size = processing_list.size(); + + // T *patch_data = + // patch->template mutable_data({static_cast(patch_size), + // static_cast(patch_elem_size)}, + // cpu_place); + patch->Resize({static_cast(patch_size), + static_cast(patch_elem_size)}); + auto *patch_data = patch->template mutable_data(lite::TargetType::kLoongArch); + constant(context, patch, 0); + const T *features = node_features.data(); + + for (auto &patch_item : processing_list) { + size_t pointer_base = patch_count * patch_elem_size; + for (auto &v : patch_item) { + T eta_l = v.eta_l(max_depth), eta_r = v.eta_r(max_depth), + eta_t = v.eta_t(max_depth); + size_t id = v.get_node() - 1; + for (int i = 0; i < feature_size; i++) { + patch_data[pointer_base + i * 3] += + eta_l * features[id * feature_size + i]; + patch_data[pointer_base + i * 3 + 1] += + eta_r * features[id * feature_size + i]; + patch_data[pointer_base + i * 3 + 2] += + eta_t * features[id * feature_size + i]; + } + } + patch_count++; + } + patch->Resize({static_cast(patch_count), + static_cast(patch_elem_size)}); + } +}; +template +class Col2TreeFunctor { + public: + 
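+  // Backward counterpart of Tree2ColFunctor: accumulates, for every tree
+  // node, the eta_l/eta_r/eta_t-weighted gradients of all patches that
+  // contain it.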
void operator()(const lite::LoongArchContext &context, + const lite::Tensor &EdgeSet, + const lite::Tensor &out_grad, + lite::Tensor *in_grad, + int max_depth) { + std::vector> tr; + auto output_dims = out_grad.dims(); + // auto cpu_place = boost::get(context.GetPlace()); + math::SetConstant constant; + int64_t output_size = output_dims[1]; + size_t grad_elem_size = 3 * static_cast(output_size); + size_t node_count = 0, grad_count = 0; + Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count); + std::vector> processing_list; + std::vector> grad_list; + grad_list.resize(node_count); + for (size_t u = 1; u <= node_count; u++) { + std::vector tmp = + Tree2ColUtil::construct_patch(u, max_depth, tr); + if (!tmp.empty()) { + processing_list.push_back(tmp); + } + } + for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) { + for (auto v : processing_list[patch_id]) { + grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1)); + } + } + // T *grad_data = + // in_grad->template mutable_data({static_cast(node_count), + // static_cast(grad_elem_size)}, + // cpu_place); + in_grad->Resize({static_cast(node_count), + static_cast(grad_elem_size)}); + auto *grad_data = in_grad->template mutable_data(lite::TargetType::kLoongArch); + + constant(context, in_grad, 0); + const T *out_g = out_grad.data(); + for (auto &patch_item : grad_list) { + size_t pointer_base = grad_count * grad_elem_size; + for (auto &v : patch_item) { + T eta_l = v.eta_l(max_depth), eta_r = v.eta_r(max_depth), + eta_t = v.eta_t(max_depth); + size_t id = v.get_node() - 1; + for (int i = 0; i < output_size; i++) { + grad_data[pointer_base + i * 3] += + eta_l * out_g[id * output_size + i]; + grad_data[pointer_base + i * 3 + 1] += + eta_r * out_g[id * output_size + i]; + grad_data[pointer_base + i * 3 + 2] += + eta_t * out_g[id * output_size + i]; + } + } + grad_count++; + } + } +}; + +template class Tree2ColFunctor; +template class Tree2ColFunctor; +template class Col2TreeFunctor; +template class Col2TreeFunctor; +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/tree2col.h b/lite/backends/loongarch/math/tree2col.h new file mode 100644 index 00000000000..976c81968d2 --- /dev/null +++ b/lite/backends/loongarch/math/tree2col.h @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +class TreeNode { + public: + size_t node; + explicit TreeNode(size_t node = 0, + size_t index = 0, + size_t pclen = 0, + size_t depth = 0) + : node(node), index(index), pclen(pclen), depth(depth) {} + template + T eta_t(T filter_depth) { + return ((filter_depth - this->depth) / filter_depth); + } + template + T eta_l(T filter_depth) { + T temp; + if (this->pclen == 1) { + temp = 0.5; + } else { + temp = (this->index - 1.0) / (this->pclen - 1.0); + } + return (1.0 - this->eta_t(filter_depth)) * temp; + } + template + T eta_r(T filter_depth) { + return (1.0 - this->eta_t(filter_depth)) * + (1.0 - this->eta_l(filter_depth)); + } + TreeNode change_node(size_t v) { + return TreeNode(v, this->index, this->pclen, this->depth); + } + size_t get_node() { return this->node; } + size_t get_depth() { return this->depth; } + + private: + size_t index, pclen, depth; +}; +class Tree2ColUtil { + public: + static std::vector construct_patch( + size_t root, int max_depth, const std::vector> &tr); + + static void construct_tree(const lite::Tensor &EdgeSet, + std::vector> *tr, + size_t *node_count); +}; + +template +class Tree2ColFunctor { + public: + void operator()(const lite::Context &context, + const lite::Tensor &EdgeSet, + const lite::Tensor &node_features, + lite::Tensor *patch, + int max_depth); +}; +template +class Col2TreeFunctor { + public: + void operator()(const lite::Context &context, + const lite::Tensor &EdgeSet, + const lite::Tensor &out_grad, + lite::Tensor *in_grad, + int max_depth); +}; +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/unpooling.cc b/lite/backends/loongarch/math/unpooling.cc new file mode 100644 index 00000000000..69b3f98d952 --- /dev/null +++ b/lite/backends/loongarch/math/unpooling.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/loongarch/math/unpooling.h" +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const lite::Tensor& indices, + lite::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->template mutable_data(lite::TargetType::kLoongArch); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + CHECK(index < output_feasize) << "err index in unpooling!"; + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& input, + const lite::Tensor& indices, + const lite::Tensor& output, + const lite::Tensor& output_grad, + lite::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const int* indices_data = indices.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kLoongArch); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + CHECK(index < output_feasize) << "err index in unpooling!"; + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/unpooling.h b/lite/backends/loongarch/math/unpooling.h new file mode 100644 index 00000000000..e7bf7972178 --- /dev/null +++ b/lite/backends/loongarch/math/unpooling.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const lite::Tensor& indices, + lite::Tensor* output); +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& input, + const lite::Tensor& indices, + const lite::Tensor& output, + const lite::Tensor& output_grad, + lite::Tensor* input_grad); +}; +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/vol2col.cc b/lite/backends/loongarch/math/vol2col.cc new file mode 100644 index 00000000000..6cd4cac79f3 --- /dev/null +++ b/lite/backends/loongarch/math/vol2col.cc @@ -0,0 +1,204 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/loongarch/math/vol2col.h" +#include +#include "lite/utils/log/cp_logging.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { + +/* + * vol = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + lite::Tensor* col) const { + CHECK_EQ(vol.dims().size(), 4); + CHECK_EQ(col->dims().size(), 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; + + const T* vol_data = vol.data(); + T* col_data = col->template mutable_data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % 
filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx = + ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + +/* + * vol = [input_channels,input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const lite::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + lite::Tensor* vol) const { + CHECK_EQ(vol->dims().size(), 4); + CHECK_EQ(col.dims().size(), 7); + + int input_channels = vol->dims()[0]; + int input_depth = vol->dims()[1]; + int input_height = vol->dims()[2]; + int input_width = vol->dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; + T* vol_data = vol->template mutable_data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx = + ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class 
Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/math/vol2col.h b/lite/backends/loongarch/math/vol2col.h new file mode 100644 index 00000000000..3a07b747d67 --- /dev/null +++ b/lite/backends/loongarch/math/vol2col.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace loongarch { +namespace math { +/* + * \brief Converts the feature data of four dimensions(CDHW) into a colData of + * seven dimensions in the Vol2ColFunctor calculation, + * And in the Col2VolFunctor calculation, it is reversed. + * + * \param volData Vol data. + * \param volShape The shape of volData, + * [input_channels, input_depth, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * \param dilations dilation data. + * \param 3-dimension [dilation_depth, dilation_height, dilation_width]. + * + * \param strides stride data. + * \param 3-dimension [stride_depth, stride_height, stride_width]. + * + * \param paddings padding data. + * \param 3-dimension [d_pad, h_pad, w_pad]. + * + * The shape of colData is: + * [input_channels, filter_depth, filter_height, filter_width, output_depth, + * output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_depth * filter_height * filter_width, and the width + * is equal output_depth * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_depth, + * filter_height, + * filter_width, ======> [height, width] + * output_depth, + * output_height, + * output_width] + * + * \note The caller needs to ensure that volShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Vol2ColFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + lite::Tensor* col) const; +}; + +template +class Col2VolFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + lite::Tensor* vol) const; +}; + +} // namespace math +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/parallel.h b/lite/backends/loongarch/parallel.h new file mode 100644 index 00000000000..a4ac3909ea9 --- /dev/null +++ b/lite/backends/loongarch/parallel.h @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <algorithm> +#include <functional> +#ifdef WITH_OMP +#include <omp.h> +#endif + +namespace paddle { +namespace lite { +namespace loongarch { + +static void SetNumThreads(int num_threads) { +#ifdef WITH_OMP + int real_num_threads = (std::max)(num_threads, 1); + omp_set_num_threads(real_num_threads); +#endif +} + +static inline int64_t GetMaxThreads() { + int64_t num_threads = 1; +#ifdef WITH_OMP + // Nested omp parallelism is not supported. + num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); +#endif + return (std::max)(num_threads, 1L); +} + +using ThreadHandler = + std::function<void(int64_t, int64_t)>; + +static inline void RunParallelFor(const int64_t begin, + const int64_t end, + const ThreadHandler& f) { + if (begin >= end) { + return; + } + +#ifdef WITH_OMP + int64_t num_threads = (std::min)(GetMaxThreads(), end - begin); + if (num_threads > 1) { +#pragma omp parallel num_threads(num_threads) + { + int64_t tid = omp_get_thread_num(); + int64_t chunk_size = (end - begin + num_threads - 1) / num_threads; + int64_t begin_tid = begin + tid * chunk_size; + f(begin_tid, (std::min)(end, chunk_size + begin_tid)); + } + return; + } +#endif + + f(begin, end); +} + +} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/port.h b/lite/backends/loongarch/port.h new file mode 100644 index 00000000000..06536da94f8 --- /dev/null +++ b/lite/backends/loongarch/port.h @@ -0,0 +1,162 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
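Usage sketch for the RunParallelFor helper introduced in lite/backends/loongarch/parallel.h above: it splits the index range [begin, end) into one contiguous chunk per OpenMP thread and falls back to a single serial call when OpenMP is disabled or only one thread is available. The caller below is illustrative only (ScaleInPlace is a made-up name, not part of this patch):

#include <cstdint>
#include <vector>
#include "lite/backends/loongarch/parallel.h"

// Hypothetical caller: scale a buffer in place, one contiguous chunk per thread.
void ScaleInPlace(std::vector<float>* data, float alpha) {
  paddle::lite::loongarch::RunParallelFor(
      0, static_cast<int64_t>(data->size()),
      [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) (*data)[i] *= alpha;
      });
}
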
+ +#pragma once + +#include +#include +#include + +#include +#include + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "lite/utils/log/cp_logging.h" + +#if !defined(_WIN32) +#include // dladdr +#include // backtrace +#include +#include +#include // std::accumulate +#else +#define NOMINMAX // msvc max/min macro conflict with std::min/max +// solve static linking error in windows +// https://github.com/google/glog/issues/301 +#define GOOGLE_GLOG_DLL_DECL +#include // _popen, _pclose +#include +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include +#include +#include // std::accumulate in msvc +#undef min +#undef max +#ifndef S_ISDIR // windows port for sys/stat.h +#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) +#endif // S_ISDIR + +static void *dlsym(void *handle, const char *symbol_name) { + FARPROC found_symbol; + found_symbol = GetProcAddress((HMODULE)handle, symbol_name); + + if (found_symbol == NULL) { + throw std::runtime_error(std::string(symbol_name) + " not found."); + } + return reinterpret_cast(found_symbol); +} + +static void *dlopen(const char *filename, int flag) { + std::string file_name(filename); + HMODULE hModule = LoadLibrary(file_name.c_str()); +#ifndef LITE_WITH_OPENCL + if (!hModule) { + throw std::runtime_error(file_name + " not found."); + } +#endif + return reinterpret_cast(hModule); +} + +#endif // !_WIN32 + +static void ExecShellCommand(const std::string &cmd, std::string *message) { + char buffer[128]; +#if !defined(_WIN32) + std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); +#else + std::shared_ptr pipe(_popen(cmd.c_str(), "r"), _pclose); +#endif // _WIN32 + if (!pipe) { + LOG(ERROR) << "error running command: " << cmd; + return; + } + while (!feof(pipe.get())) { + if (fgets(buffer, 128, pipe.get()) != nullptr) { + *message += buffer; + } + } +} + +static bool PathExists(const std::string &path) { +#if !defined(_WIN32) + struct stat statbuf; + if (stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } +#else + struct _stat statbuf; + if (_stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } +#endif // !_WIN32 + return false; +} + +// TODO(yuyang18): If the functions below are needed by other files, move them +// to paddle::filesystem namespace. 
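As a usage sketch for the ExecShellCommand helper above, which runs a command through popen/_popen and appends its stdout to the output string (HostArch below is a hypothetical caller, not part of the patch):

#include <string>
#include "lite/backends/loongarch/port.h"

// Hypothetical helper: capture the host architecture string, e.g. "loongarch64\n".
std::string HostArch() {
  std::string arch;
  ExecShellCommand("uname -m", &arch);  // arch stays empty if the pipe cannot be opened
  return arch;
}
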
+#if !defined(_WIN32) +constexpr char kSEP = '/'; +#else +constexpr char kSEP = '\\'; +#endif // _WIN32 + +static bool FileExists(const std::string &filepath) { +#if !defined(_WIN32) + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +#else + struct _stat buffer; + return (_stat(filepath.c_str(), &buffer) == 0); +#endif // !_WIN32 +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + std::string path_error(path); + path_error += " mkdir failed!"; +#if !defined(_WIN32) + if (mkdir(path, 0755)) { + if (errno != EEXIST) { + throw std::runtime_error(path_error); + } + } +#else + BOOL return_value = CreateDirectory(path, NULL); + if (!return_value) { + auto errorno = GetLastError(); + if (errorno != ERROR_ALREADY_EXISTS) { + throw std::runtime_error(path_error); + } + } +#endif // !_WIN32 +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} diff --git a/lite/backends/loongarch/target_wrapper.cc b/lite/backends/loongarch/target_wrapper.cc new file mode 100644 index 00000000000..91ab2afbecf --- /dev/null +++ b/lite/backends/loongarch/target_wrapper.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/target_wrapper.h" +#include +#include "lite/backends/loongarch/target_wrapper.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { + +template <> +void TargetWrapper::MemcpySync(void *dst, + const void *src, + size_t size, + IoDirection dir) { + std::copy_n(reinterpret_cast(src), + size, + reinterpret_cast(dst)); +} + +template class TargetWrapper; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/target_wrapper.h b/lite/backends/loongarch/target_wrapper.h new file mode 100644 index 00000000000..f0abcaddafd --- /dev/null +++ b/lite/backends/loongarch/target_wrapper.h @@ -0,0 +1,22 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
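For reference, the MemcpySync specialization in target_wrapper.cc above (its template argument is not visible in this rendering) is a plain synchronous byte copy; the IoDirection argument is ignored because LoongArch kernels operate on ordinary host memory. A behaviorally equivalent standalone sketch, with a made-up function name:

#include <algorithm>
#include <cstddef>

// Copies `size` bytes from src to dst; direction is irrelevant for host-to-host copies.
inline void HostMemcpySync(void* dst, const void* src, size_t size) {
  std::copy_n(reinterpret_cast<const char*>(src), size,
              reinterpret_cast<char*>(dst));
}
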
+ +#pragma once +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { +namespace loongarch {} // namespace loongarch +} // namespace lite +} // namespace paddle diff --git a/lite/backends/loongarch/warpctc_lib_path.h.in b/lite/backends/loongarch/warpctc_lib_path.h.in new file mode 100644 index 00000000000..dc5064f4573 --- /dev/null +++ b/lite/backends/loongarch/warpctc_lib_path.h.in @@ -0,0 +1,17 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define WARPCTC_LIB_PATH "@WARPCTC_INSTALL_DIR@/lib/" diff --git a/lite/backends/loongarch/xxl.h b/lite/backends/loongarch/xxl.h new file mode 100644 index 00000000000..a1e3873738f --- /dev/null +++ b/lite/backends/loongarch/xxl.h @@ -0,0 +1,930 @@ +#pragma once + +#ifdef __loongarch_sx +#include +#endif // __loongarch_sx +#ifdef __loongarch_asx +#include +#endif // __loongarch_asx + +#define XXL_INLINE extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +typedef long long int __m64; + +#define LSX_TRANSPOSE4_S(row0, row1, row2, row3) \ +do { \ + __m128i __r0 = (__m128i)(row0); \ + __m128i __r1 = (__m128i)(row1); \ + __m128i __r2 = (__m128i)(row2); \ + __m128i __r3 = (__m128i)(row3); \ + __m128i __t0 = __lsx_vilvl_w(__r1, __r0); \ + __m128i __t1 = __lsx_vilvh_w(__r1, __r0); \ + __m128i __t2 = __lsx_vilvl_w(__r3, __r2); \ + __m128i __t3 = __lsx_vilvh_w(__r3, __r2); \ + (row0) = (__m128)__lsx_vilvl_d(__t2, __t0); \ + (row1) = (__m128)__lsx_vilvh_d(__t2, __t0); \ + (row2) = (__m128)__lsx_vilvl_d(__t3, __t1); \ + (row3) = (__m128)__lsx_vilvh_d(__t3, __t1); \ +} while (0) + +#define MASKCOPY(dst, src, mask, cnt_limit, elem_type) \ +do { \ + char *__dst = (char *)dst; \ + char *__src = (char *)src; \ + unsigned int __sz = sizeof(elem_type); \ + for (int __i = 0; __i < cnt_limit; __i++) { \ + if (!(mask & (1 << __i))) \ + continue; \ + __builtin_memcpy(__dst + __i * __sz, __src + __i * __sz, __sz); \ + } \ +} while (0) + + +#define LSX_SHUFFLE(a, b, c, d) ((a&3)*64+(b&3)*16+(c&3)*4+d) + +#ifdef __loongarch_asx +XXL_INLINE __m256i __lasx_cvt_128_256(__m128i a) { + __m256i _0; + __builtin_memcpy(&_0, &a, sizeof(__m128i)); + return _0; +} +XXL_INLINE __m128i __lasx_cvt_256_128(__m256i a) { + __m128i _0; + __builtin_memcpy(&_0, &a, sizeof(__m128i)); + return _0; +} +XXL_INLINE __m256i __lasx_xvperm_q(__m256i a, __m256i b, int c) { + switch (((c>>2)&12)|(c&3)) { + case 0: return __lasx_xvpermi_q(a, b, 0); + case 1: return __lasx_xvpermi_q(a, b, 1); + case 2: return __lasx_xvpermi_q(a, b, 2); + case 3: return __lasx_xvpermi_q(a, b, 3); + case 4: return __lasx_xvpermi_q(a, b, 16); + case 5: return __lasx_xvpermi_q(a, b, 17); + case 6: return __lasx_xvpermi_q(a, b, 18); + case 7: return __lasx_xvpermi_q(a, b, 19); + case 8: return __lasx_xvpermi_q(a, b, 32); + case 9: return __lasx_xvpermi_q(a, b, 33); + case 10: return __lasx_xvpermi_q(a, b, 34); + case 11: return __lasx_xvpermi_q(a, b, 35); + case 12: return __lasx_xvpermi_q(a, b, 48); + 
case 13: return __lasx_xvpermi_q(a, b, 49); + case 14: return __lasx_xvpermi_q(a, b, 50); + case 15: return __lasx_xvpermi_q(a, b, 51); + } + __builtin_unreachable(); +} +#endif // __loongarch_asx + +#define DEF_FCMP(pre, sub, typi, typo) \ +XXL_INLINE typo __##pre##fcmp_xxx_##sub(typi a, typi b, const int c) { \ + switch (c&31) { \ + case 0: return __##pre##fcmp_ceq_##sub(a, b); \ + case 1: return __##pre##fcmp_slt_##sub(a, b); \ + case 2: return __##pre##fcmp_sle_##sub(a, b); \ + case 3: return __##pre##fcmp_cun_##sub(a, b); \ + case 4: return __##pre##fcmp_cune_##sub(a, b); \ + case 5: return __##pre##fcmp_sule_##sub(b, a); \ + case 6: return __##pre##fcmp_sult_##sub(b, a); \ + case 7: return __##pre##fcmp_cor_##sub(a, b); \ + case 8: return __##pre##fcmp_cueq_##sub(a, b); \ + case 9: return __##pre##fcmp_sult_##sub(a, b); \ + case 10: return __##pre##fcmp_sule_##sub(a, b); \ + case 11: return __##pre##fcmp_caf_##sub(a, b); \ + case 12: return __##pre##fcmp_cne_##sub(a, b); \ + case 13: return __##pre##fcmp_sle_##sub(b, a); \ + case 14: return __##pre##fcmp_slt_##sub(b, a); \ + case 15: return __##pre##xori_b(__##pre##fcmp_caf_##sub(a, b), 0xff); \ + case 16: return __##pre##fcmp_seq_##sub(a, b); \ + case 17: return __##pre##fcmp_clt_##sub(a, b); \ + case 18: return __##pre##fcmp_cle_##sub(a, b); \ + case 19: return __##pre##fcmp_sun_##sub(a, b); \ + case 20: return __##pre##fcmp_sune_##sub(a, b); \ + case 21: return __##pre##fcmp_cule_##sub(b, a); \ + case 22: return __##pre##fcmp_cult_##sub(b, a); \ + case 23: return __##pre##fcmp_sor_##sub(a, b); \ + case 24: return __##pre##fcmp_sueq_##sub(a, b); \ + case 25: return __##pre##fcmp_cult_##sub(a, b); \ + case 26: return __##pre##fcmp_cule_##sub(a, b); \ + case 27: return __##pre##fcmp_saf_##sub(a, b); \ + case 28: return __##pre##fcmp_sne_##sub(a, b); \ + case 29: return __##pre##fcmp_cle_##sub(b, a); \ + case 30: return __##pre##fcmp_clt_##sub(b, a); \ + case 31: return __##pre##xori_b(__##pre##fcmp_saf_##sub(a, b), 0xff); \ + } \ + __builtin_unreachable(); \ +} +#ifdef __loongarch_asx +DEF_FCMP(lasx_xv, d, __m256d, __m256i) +DEF_FCMP(lasx_xv, s, __m256, __m256i) +#endif // __loongarch_asx +#ifdef __loongarch_sx +DEF_FCMP(lsx_v, d, __m128d, __m128i) +DEF_FCMP(lsx_v, s, __m128, __m128i) +#endif // __loongarch_sx + +#if defined(__loongarch_asx) && defined(__loongarch_sx) +XXL_INLINE __m128 lasx_extractf128_f32(__m256 a, const int b) { + return (__m128)__lasx_cvt_256_128(b&1?__lasx_xvpermi_q((__m256i)a, (__m256i)a, 1):(__m256i)a); +} + +XXL_INLINE __m128i lasx_extractf128_m256i(__m256i a, const int b) { + return __lasx_cvt_256_128(b&1?__lasx_xvpermi_q(a, a, 1):a); +} + +XXL_INLINE __m256 lasx_insertf128_f32(__m256 a, __m128 b, int c) { + __m256 _0; + _0 = (__m256)__lasx_xvpermi_q(a, __lasx_cvt_128_256((__m128i)b), c & 1 ? 
2 : 48); + return _0; +} + +XXL_INLINE __m128 lasx_castm256_m128(__m256 a) { + return (__m128)__lasx_cvt_256_128((__m256i)a); +} + +XXL_INLINE __m128d lasx_castm256d_m128d(__m256d a) { + return (__m128d)__lasx_cvt_256_128((__m256i)a); +} + +XXL_INLINE __m128i lasx_castm256i_m128i(__m256i a) { + return __lasx_cvt_256_128(a); +} + +XXL_INLINE __m256i lasx_broadcastm128i_m256i(__m128i a) { + return __lasx_xvreplve0_q(__lasx_cvt_128_256(a)); +} + +XXL_INLINE __m128i lasx_extracti128_m256i(__m256i a, const int b) { + return __lasx_cvt_256_128(b&1?__lasx_xvpermi_q(a, a, 1):a); +} + +XXL_INLINE __m256i lasx_inserti128_m256i(__m256i a, __m128i b, const int c) { + __m256i _0; + _0 = __lasx_xvpermi_q(a, __lasx_cvt_128_256(b), c & 1 ? 2 : 48); + return _0; +} + +XXL_INLINE __m256i lasx_cvti8_i16(__m128i a) { + return __lasx_vext2xv_h_b(__lasx_cvt_128_256(a)); +} + +XXL_INLINE __m256i lasx_cvti8_i32(__m128i a) { + return __lasx_vext2xv_w_b(__lasx_cvt_128_256(a)); +} + +#endif // __loongarch_asx and __loongarch_sx + +#ifdef __loongarch_asx +XXL_INLINE __m256d lasx_add_f64(__m256d a, __m256d b) { + return __lasx_xvfadd_d(a, b); +} + +XXL_INLINE __m256 lasx_add_f32(__m256 a, __m256 b) { + return __lasx_xvfadd_s(a, b); +} + +XXL_INLINE __m256d lasx_div_f64(__m256d a, __m256d b) { + return __lasx_xvfdiv_d(a, b); +} + +XXL_INLINE __m256 lasx_div_f32(__m256 a, __m256 b) { + return __lasx_xvfdiv_s(a, b); +} + +XXL_INLINE __m256d lasx_hadd_f64(__m256d a, __m256d b) { + return __lasx_xvfadd_d((__m256d)__lasx_xvpickev_d((__m256i)b, (__m256i)a), (__m256d)__lasx_xvpickod_d((__m256i)b, (__m256i)a)); +} + +XXL_INLINE __m256 lasx_hadd_f32(__m256 a, __m256 b) { + return __lasx_xvfadd_s((__m256)__lasx_xvpickev_w((__m256i)b, (__m256i)a), (__m256)__lasx_xvpickod_w((__m256i)b, (__m256i)a)); +} + +XXL_INLINE __m256d lasx_mul_f64(__m256d a, __m256d b) { + return __lasx_xvfmul_d(a, b); +} + +XXL_INLINE __m256 lasx_mul_f32(__m256 a, __m256 b) { + return __lasx_xvfmul_s(a, b); +} + +XXL_INLINE __m256d lasx_sub_f64(__m256d a, __m256d b) { + return __lasx_xvfsub_d(a, b); +} + +XXL_INLINE __m256 lasx_sub_f32(__m256 a, __m256 b) { + return __lasx_xvfsub_s(a, b); +} + +XXL_INLINE __m256 lasx_and_f32(__m256 a, __m256 b) { + return (__m256)__lasx_xvand_v((__m256i)a, (__m256i)b); +} + +XXL_INLINE __m256 lasx_andnot_f32(__m256 a, __m256 b) { + return (__m256)__lasx_xvandn_v((__m256i)a, (__m256i)b); +} + +XXL_INLINE __m256 lasx_or_f32(__m256 a, __m256 b) { + return (__m256)__lasx_xvor_v((__m256i)a, (__m256i)b); +} + +XXL_INLINE __m256 lasx_xor_f32(__m256 a, __m256 b) { + return (__m256)__lasx_xvxor_v((__m256i)a, (__m256i)b); +} + +XXL_INLINE __m256 lasx_blend_f32(__m256 a, __m256 b, const int c) { + return (__m256)__lasx_xvbitsel_v((__m256i)a, (__m256i)b, __lasx_vext2xv_w_b(__lasx_xvldi(c|0xf900))); +} + +XXL_INLINE __m256 lasx_blendv_f32(__m256 a, __m256 b, __m256 c) { + return (__m256)__lasx_xvbitsel_v((__m256i)a, (__m256i)b, __lasx_xvslti_w((__m256i)c, 0)); +} + +XXL_INLINE __m256 lasx_shuffle_f32(__m256 a, __m256 b, const int c) { + return (__m256)__lasx_xvpermi_w((__m256i)b, (__m256i)a, (unsigned int)c); +} + +XXL_INLINE __m256 lasx_permute2f128_f32(__m256 a, __m256 b, int c) { + __m256 _0; + __m256i _1 = __lasx_xvldi(0); + __m256i _2 = c&136 ? _1 : (__m256i)b; + __m256 _3 = c&128 ? (c&2?b:a) : c&0x8 ? (c&32?b:a) : a; + int _4 = c&128 ? ((c&1)|32) : c&0x8 ? ((c&16)|2) : c&51; + _0 = (c&136)==136 ? 
(__m256)_1 : (__m256)__lasx_xvperm_q(_2, (__m256i)_3, _4); + return _0; +} + +XXL_INLINE __m256d lasx_permute2f128_f64(__m256d a, __m256d b, int c) { + __m256d _0; + __m256i _1 = __lasx_xvldi(0); + __m256i _2 = c&136 ? _1 : (__m256i)b; + __m256d _3 = c&128 ? (c&2?b:a) : c&0x8 ? (c&32?b:a) : a; + int _4 = c&128 ? ((c&1)|32) : c&0x8 ? ((c&16)|2) : c&51; + _0 = (c&136)==136 ? (__m256d)_1 : (__m256d)__lasx_xvperm_q(_2, (__m256i)_3, _4); + return _0; +} + +XXL_INLINE __m256 lasx_unpackhi_f32(__m256 a, __m256 b) { + return (__m256)__lasx_xvilvh_w((__m256i)b, (__m256i)a); +} + +XXL_INLINE __m256 lasx_unpacklo_f32(__m256 a, __m256 b) { + return (__m256)__lasx_xvilvl_w((__m256i)b, (__m256i)a); +} + +XXL_INLINE __m256d lasx_max_f64(__m256d a, __m256d b) { + return __lasx_xvfmax_d(a, b); +} + +XXL_INLINE __m256 lasx_max_f32(__m256 a, __m256 b) { + return __lasx_xvfmax_s(a, b); +} + +XXL_INLINE __m256 lasx_min_f32(__m256 a, __m256 b) { + return __lasx_xvfmin_s(a, b); +} + +XXL_INLINE __m256 lasx_floor_f32(__m256 a) { + return __lasx_xvfrintrm_s(a); +} + +XXL_INLINE __m256 lasx_cmp_f32(__m256 a, __m256 b, const int c) { + return (__m256)__lasx_xvfcmp_xxx_s(a, b, c); +} + +XXL_INLINE __m256 lasx_xvfcmp_slt_s(__m256 a, __m256 b) { + return (__m256)__lasx_xvfcmp_slt_s(a, b); +} + +XXL_INLINE __m256 lasx_xvfcmp_sle_s(__m256 a, __m256 b) { + return (__m256)__lasx_xvfcmp_sle_s(a, b); +} + +XXL_INLINE __m256 lasx_cvti32_f32(__m256i a) { + return __lasx_xvffint_s_w(a); +} + +XXL_INLINE __m256i lasx_cvtf32_i32(__m256 a) { + return __lasx_xvftint_w_s(a); +} + +XXL_INLINE __m256i lasx_cvttf32_i32(__m256 a) { + return __lasx_xvftintrz_w_s(a); +} + +XXL_INLINE __m256 lasx_broadcast_1f32(float const * a) { + return (__m256)__lasx_xvldrepl_w((void *)a, 0); +} + +XXL_INLINE __m256d lasx_broadcast_sd(double const * a) { + return (__m256d)__lasx_xvldrepl_d((void *)a, 0); +} + +XXL_INLINE __m256 lasx_load_f32(float const * a) { + return (__m256)__lasx_xvld((void *)a, 0); +} + +XXL_INLINE __m256d lasx_loadu_f64(double const * a) { + return (__m256d)__lasx_xvld((void *)a, 0); +} + +XXL_INLINE __m256 lasx_loadu_f32(float const * a) { + return (__m256)__lasx_xvld((void *)a, 0); +} + +XXL_INLINE __m256i lasx_loadu_m256i(__m256i const * a) { + return __lasx_xvld((void *)a, 0); +} + +XXL_INLINE void lasx_storeu_f64(double * a, __m256d b) { + return __lasx_xvst((__m256i)b, (void *)a, 0); +} + +XXL_INLINE void lasx_storeu_f32(float * a, __m256 b) { + return __lasx_xvst((__m256i)b, (void *)a, 0); +} + +XXL_INLINE void lasx_storeu_m256i(__m256i * a, __m256i b) { + return __lasx_xvst(b, (void *)a, 0); +} + +XXL_INLINE void lasx_maskstore_f32(float * a, __m256i b, __m256 c) { + __m256i vmask = __lasx_xvmskltz_w(b); + int mask = __lasx_xvpickve2gr_w(vmask, 0) | (__lasx_xvpickve2gr_w(vmask, 4) << 4); + MASKCOPY(a, &c, mask, 8, float); +} + +XXL_INLINE __m256d lasx_sqrt_f64(__m256d a) { + return __lasx_xvfsqrt_d(a); +} + +XXL_INLINE __m256 lasx_sqrt_f32(__m256 a) { + return __lasx_xvfsqrt_s(a); +} + +XXL_INLINE __m256d lasx_setzero_f64() { + return (__m256d)__lasx_xvldi(0); +} + +XXL_INLINE __m256 lasx_setzero_f32() { + return (__m256)__lasx_xvldi(0); +} + +XXL_INLINE __m256i lasx_setzero_m256i() { + return __lasx_xvldi(0); +} + +XXL_INLINE __m256 lasx_set_f32(float a, float b, float c, float d, float e, float f, float g, float h) { + return (__m256)(v8f32){h,g,f,e,d,c,b,a}; +} + +XXL_INLINE __m256i lasx_set_i32(int a, int b, int c, int d, int e, int f, int g, int h) { + return (__m256i)(v8i32){h,g,f,e,d,c,b,a}; +} + +XXL_INLINE __m256d 
lasx_set1_f64(double a) { + return (__m256d)(v4f64){a,a,a,a}; +} + +XXL_INLINE __m256 lasx_set1_f32(float a) { + return (__m256)(v8f32){a,a,a,a,a,a,a,a}; +} + +XXL_INLINE __m256i lasx_set1_i8(char a) { + return __lasx_xvreplgr2vr_b((int)a); +} + +XXL_INLINE __m256i lasx_set1_i16(short a) { + return __lasx_xvreplgr2vr_h((int)a); +} + +XXL_INLINE __m256i lasx_set1_i32(int a) { + return __lasx_xvreplgr2vr_w(a); +} + +XXL_INLINE __m256i lasx_set1_i64x(long long a) { + return __lasx_xvreplgr2vr_d((long int)a); +} + +XXL_INLINE __m256 lasx_castf64_f32(__m256d a) { + return (__m256)a; +} + +XXL_INLINE __m256d lasx_castf32_f64(__m256 a) { + return (__m256d)a; +} + +XXL_INLINE __m256i lasx_castf32_m256i(__m256 a) { + return (__m256i)a; +} + +XXL_INLINE __m256 lasx_castm256i_f32(__m256i a) { + return (__m256)a; +} + +XXL_INLINE __m256i lasx_permute2x128_m256i(__m256i a, __m256i b, const int c) { + __m256i _0; + __m256i _1 = __lasx_xvldi(0); + __m256i _2 = c&136 ? _1 : b; + __m256i _3 = c&128 ? (c&2?b:a) : c&0x8 ? (c&32?b:a) : a; + int _4 = c&128 ? ((c&1)|32) : c&0x8 ? ((c&16)|2) : c&51; + _0 = (c&136)==136 ? _1 : __lasx_xvperm_q(_2, _3, _4); + return _0; +} + +XXL_INLINE __m256i lasx_permute4x64_i64(__m256i a, const int b) { + return __lasx_xvpermi_d(a, (unsigned int)b); +} + +XXL_INLINE __m256d lasx_permute4x64_f64(__m256d a, const int b) { + return (__m256d)__lasx_xvpermi_d((__m256i)a, (unsigned int)b); +} + +XXL_INLINE __m256 lasx_permutevar8x32_f32(__m256 a, __m256i b) { + return (__m256)__lasx_xvshuf_w(b, __lasx_xvpermi_q((__m256i)a, (__m256i)a, 17), __lasx_xvpermi_q((__m256i)a, (__m256i)a, 0)); +} + +XXL_INLINE __m256i lasx_unpackhi_i8(__m256i a, __m256i b) { + return __lasx_xvilvh_b(b, a); +} + +XXL_INLINE __m256i lasx_unpackhi_i16(__m256i a, __m256i b) { + return __lasx_xvilvh_h(b, a); +} + +XXL_INLINE __m256i lasx_unpackhi_i32(__m256i a, __m256i b) { + return __lasx_xvilvh_w(b, a); +} + +XXL_INLINE __m256i lasx_unpacklo_i8(__m256i a, __m256i b) { + return __lasx_xvilvl_b(b, a); +} + +XXL_INLINE __m256i lasx_unpacklo_i16(__m256i a, __m256i b) { + return __lasx_xvilvl_h(b, a); +} + +XXL_INLINE __m256i lasx_unpacklo_i32(__m256i a, __m256i b) { + return __lasx_xvilvl_w(b, a); +} + +XXL_INLINE __m256i lasx_max_i8(__m256i a, __m256i b) { + return __lasx_xvmax_b(a, b); +} + +XXL_INLINE __m256i lasx_max_i32(__m256i a, __m256i b) { + return __lasx_xvmax_w(a, b); +} + +XXL_INLINE __m256i lasx_min_i32(__m256i a, __m256i b) { + return __lasx_xvmin_w(a, b); +} + +XXL_INLINE __m256i lasx_add_i32(__m256i a, __m256i b) { + return __lasx_xvadd_w(a, b); +} + +XXL_INLINE __m256i lasx_add_i64(__m256i a, __m256i b) { + return __lasx_xvadd_d(a, b); +} + +XXL_INLINE __m256i lasx_adds_i16(__m256i a, __m256i b) { + return __lasx_xvsadd_h(a, b); +} + +XXL_INLINE __m256i lasx_hadd_i32(__m256i a, __m256i b) { + return __lasx_xvadd_w(__lasx_xvpickev_w(b, a), __lasx_xvpickod_w(b, a)); +} + +XXL_INLINE __m256i lasx_madd_i16(__m256i a, __m256i b) { + return __lasx_xvadd_w(__lasx_xvmulwev_w_h(a, b), __lasx_xvmulwod_w_h(a, b)); +} + +XXL_INLINE __m256i lasx_maddubs_i16(__m256i a, __m256i b) { + return __lasx_xvsadd_h(__lasx_xvmulwev_h_bu_b(a, b), __lasx_xvmulwod_h_bu_b(a, b)); +} + +XXL_INLINE __m256i lasx_mullo_i32(__m256i a, __m256i b) { + return __lasx_xvmul_w(a, b); +} + +XXL_INLINE __m256i lasx_sub_i32(__m256i a, __m256i b) { + return __lasx_xvsub_w(a, b); +} + +XXL_INLINE __m256i lasx_sub_i64(__m256i a, __m256i b) { + return __lasx_xvsub_d(a, b); +} + +XXL_INLINE __m256i lasx_packs_i16(__m256i a, __m256i b) { + return 
__lasx_xvpickev_b(__lasx_xvsat_h(b, 7), __lasx_xvsat_h(a, 7)); +} + +XXL_INLINE __m256i lasx_packs_i32(__m256i a, __m256i b) { + return __lasx_xvpickev_h(__lasx_xvsat_w(b, 15), __lasx_xvsat_w(a, 15)); +} + +XXL_INLINE __m256i lasx_packus_i16(__m256i a, __m256i b) { + return __lasx_xvpickev_b(__lasx_xvsat_hu(__lasx_xvmax_h(b, __lasx_xvldi(0)), 7), __lasx_xvsat_hu(__lasx_xvmax_h(a, __lasx_xvldi(0)), 7)); +} + +XXL_INLINE __m256i lasx_and_m256i(__m256i a, __m256i b) { + return __lasx_xvand_v(a, b); +} + +XXL_INLINE __m256i lasx_andnot_m256i(__m256i a, __m256i b) { + return __lasx_xvandn_v(a, b); +} + +XXL_INLINE __m256i lasx_or_m256i(__m256i a, __m256i b) { + return __lasx_xvor_v(a, b); +} + +XXL_INLINE __m256i lasx_cmpeq_i32(__m256i a, __m256i b) { + return __lasx_xvseq_w(a, b); +} + +XXL_INLINE __m256i lasx_maskload_i32(int const* a, __m256i b) { + __m256i _0; + _0 = __lasx_xvldi(0); + __m256i vmask = __lasx_xvmskltz_w(b); + int mask = __lasx_xvpickve2gr_w(vmask, 0) | (__lasx_xvpickve2gr_w(vmask, 4) << 4); + MASKCOPY(&_0, a, mask, 8, int); + return _0; +} + +XXL_INLINE __m256i lasx_maskload_i64(long long const* a, __m256i b) { + __m256i _0; + _0 = __lasx_xvldi(0); + __m256i vmask = __lasx_xvmskltz_d(b); + int mask = __lasx_xvpickve2gr_w(vmask, 0) | (__lasx_xvpickve2gr_w(vmask, 4) << 2); + MASKCOPY(&_0, a, mask, 4, long long int); + return _0; +} + +XXL_INLINE void lasx_maskstore_i32(int* a, __m256i b, __m256i c) { + __m256i vmask = __lasx_xvmskltz_w(b); + int mask = __lasx_xvpickve2gr_w(vmask, 0) | (__lasx_xvpickve2gr_w(vmask, 4) << 4); + MASKCOPY(a, &c, mask, 8, int); +} + +XXL_INLINE __m256i lasx_slli_i32(__m256i a, int b) { + return b>=32?__lasx_xvldi(0):__lasx_xvslli_w(a, (unsigned int)b); +} + +XXL_INLINE __m256i lasx_srli_i32(__m256i a, int b) { + return b>=32?__lasx_xvldi(0):__lasx_xvsrli_w(a, (unsigned int)b); +} + +XXL_INLINE __m256 lasx_fmadd_f32(__m256 a, __m256 b, __m256 c) { + return __lasx_xvfmadd_s(a, b, c); +} + +#endif // __loongarch_asx + +#ifdef __loongarch_sx +XXL_INLINE __m128 lsx_cmp_f32(__m128 a, __m128 b, const int c) { + return (__m128)__lsx_vfcmp_xxx_s(a, b, c); +} + +XXL_INLINE __m128 lsx_vfcmp_slt_s(__m128 a, __m128 b) { + return (__m128)__lsx_vfcmp_slt_s(a, b); +} + +XXL_INLINE void lsx_maskstore_f32(float * a, __m128i b, __m128 c) { + int mask = __lsx_vpickve2gr_w(__lsx_vmskltz_w(b), 0); + MASKCOPY(a, &c, mask, 4, float); +} + +XXL_INLINE __m128 lsx_fmadd_f32(__m128 a, __m128 b, __m128 c) { + return __lsx_vfmadd_s(a, b, c); +} + +XXL_INLINE __m128 lsx_shuffle_f32(__m128 a, __m128 b, unsigned int c) { + return (__m128)__lsx_vpermi_w((__m128i)b, (__m128i)a, c); +} + +XXL_INLINE __m128 lsx_unpackhi_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vilvh_w((__m128i)b, (__m128i)a); +} + +XXL_INLINE __m128 lsx_unpacklo_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vilvl_w((__m128i)b, (__m128i)a); +} + +XXL_INLINE __m128 lsx_min_f32(__m128 a, __m128 b) { + return __lsx_vfmin_s(a, b); +} + +XXL_INLINE __m128 lsx_max_f32(__m128 a, __m128 b) { + return __lsx_vfmax_s(a, b); +} + +XXL_INLINE __m128 lsx_add_f32(__m128 a, __m128 b) { + return __lsx_vfadd_s(a, b); +} + +XXL_INLINE __m128 lsx_sub_f32(__m128 a, __m128 b) { + return __lsx_vfsub_s(a, b); +} + +XXL_INLINE __m128 lsx_mul_f32(__m128 a, __m128 b) { + return __lsx_vfmul_s(a, b); +} + +XXL_INLINE __m128 lsx_div_f32(__m128 a, __m128 b) { + return __lsx_vfdiv_s(a, b); +} + +XXL_INLINE void lsx_storel_pi(__m64* a, __m128 b) { + return __lsx_vstelm_d((__m128i)b, (void *)a, 0, 0); +} + +XXL_INLINE void 
lsx_store_1f32(float* a, __m128 b) { + return __lsx_vstelm_w((__m128i)b, (void *)a, 0, 0); +} + +XXL_INLINE void lsx_storeu_f32(float* a, __m128 b) { + return __lsx_vst((__m128i)b, (void *)a, 0); +} + +XXL_INLINE __m128 lsx_sqrt_f32(__m128 a) { + return __lsx_vfsqrt_s(a); +} + +XXL_INLINE __m128 lsx_and_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vand_v((__m128i)a, (__m128i)b); +} + +XXL_INLINE __m128 lsx_cmplt_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vfcmp_clt_s(a, b); +} + +XXL_INLINE __m128 lsx_cmple_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vfcmp_cle_s(a, b); +} + +XXL_INLINE __m128 lsx_cmpgt_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vfcmp_clt_s(b, a); +} + +XXL_INLINE __m128 lsx_cmpge_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vfcmp_cle_s(b, a); +} + +XXL_INLINE __m128 lsx_set1_f32(float a) { + return (__m128)(v4f32){a,a,a,a}; +} + +XXL_INLINE __m128 lsx_setzero_f32() { + return (__m128)__lsx_vldi(0); +} + +XXL_INLINE __m128 lsx_loadl_pi(__m128 a, __m64 const* b) { + __m128 _0; + _0 = (__m128)a; __builtin_memcpy(&_0, b, sizeof(__m64)); + return _0; +} + +XXL_INLINE __m128 lsx_load1_f32(float const* a) { + return (__m128)__lsx_vldrepl_w((void *)a, 0); +} + +XXL_INLINE __m128 lsx_loadu_f32(float const* a) { + return (__m128)__lsx_vld((void *)a, 0); +} + +XXL_INLINE __m128 lsx_movehl_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vilvh_d((__m128i)a, (__m128i)b); +} + +XXL_INLINE __m128 lsx_movelh_f32(__m128 a, __m128 b) { + return (__m128)__lsx_vilvl_d((__m128i)b, (__m128i)a); +} + +XXL_INLINE __m128i lsx_loadl_i64(__m128i const* a) { + __m128i _0; + _0 = __lsx_vinsgr2vr_d(__lsx_vldi(0), *(long*)(a), 0); + return _0; +} + +XXL_INLINE __m128i lsx_loadu_m128i(__m128i const* a) { + return __lsx_vld((void *)a, 0); +} + +XXL_INLINE __m128d lsx_load1_f64(double const* a) { + return (__m128d)__lsx_vldrepl_d((void *)a, 0); +} + +XXL_INLINE __m128d lsx_loadu_f64(double const* a) { + return (__m128d)__lsx_vld((void *)a, 0); +} + +XXL_INLINE void lsx_storeu_m128i(__m128i* a, __m128i b) { + return __lsx_vst(b, (void *)a, 0); +} + +XXL_INLINE void lsx_storel_i64(__m128i* a, __m128i b) { + return __lsx_vstelm_d(b, (void *)a, 0, 0); +} + +XXL_INLINE void lsx_store_sd(double* a, __m128d b) { + __builtin_memcpy(a, &b, sizeof(double)); +} + +XXL_INLINE void lsx_storeu_f64(double* a, __m128d b) { + return __lsx_vst((__m128i)b, (void *)a, 0); +} + +XXL_INLINE __m128i lsx_add_i32(__m128i a, __m128i b) { + return __lsx_vadd_w(a, b); +} + +XXL_INLINE __m128i lsx_add_i64(__m128i a, __m128i b) { + return __lsx_vadd_d(a, b); +} + +XXL_INLINE __m128i lsx_madd_i16(__m128i a, __m128i b) { + return __lsx_vadd_w(__lsx_vmulwev_w_h(a, b), __lsx_vmulwod_w_h(a, b)); +} + +XXL_INLINE __m128i lsx_sub_i32(__m128i a, __m128i b) { + return __lsx_vsub_w(a, b); +} + +XXL_INLINE __m128i lsx_sub_i64(__m128i a, __m128i b) { + return __lsx_vsub_d(a, b); +} + +XXL_INLINE __m128d lsx_add_f64(__m128d a, __m128d b) { + return __lsx_vfadd_d(a, b); +} + +XXL_INLINE __m128d lsx_div_f64(__m128d a, __m128d b) { + return __lsx_vfdiv_d(a, b); +} + +XXL_INLINE __m128d lsx_mul_f64(__m128d a, __m128d b) { + return __lsx_vfmul_d(a, b); +} + +XXL_INLINE __m128d lsx_sub_f64(__m128d a, __m128d b) { + return __lsx_vfsub_d(a, b); +} + +XXL_INLINE __m128d lsx_max_f64(__m128d a, __m128d b) { + return __lsx_vfmax_d(a, b); +} + +XXL_INLINE __m128i lsx_srli_m128i(__m128i a, int b) { + return b>=16?__lsx_vldi(0):__lsx_vbsrl_v(a, (unsigned int)b); +} + +XXL_INLINE __m128 lsx_cvti32_f32(__m128i a) { + return 
__lsx_vffint_s_w(a); +} + +XXL_INLINE __m128i lsx_cvtf32_i32(__m128 a) { + return __lsx_vftint_w_s(a); +} + +XXL_INLINE __m128i lsx_set_i8(char a, char b, char c, char d, char e, char f, char g, char h, char i, char j, char k, char l, char m, char n, char o, char p) { + return (__m128i)(v16i8){p,o,n,m,l,k,j,i,h,g,f,e,d,c,b,a}; +} + +XXL_INLINE __m128i lsx_set1_i64(__m64 a) { + return __lsx_vreplgr2vr_d((long int)a); +} + +XXL_INLINE __m128i lsx_set1_i64x(long long a) { + return __lsx_vreplgr2vr_d((long int)a); +} + +XXL_INLINE __m128i lsx_set1_i32(int a) { + return __lsx_vreplgr2vr_w(a); +} + +XXL_INLINE __m128i lsx_set1_i16(short a) { + return __lsx_vreplgr2vr_h((int)a); +} + +XXL_INLINE __m128i lsx_set1_i8(char a) { + return __lsx_vreplgr2vr_b((int)a); +} + +XXL_INLINE __m128i lsx_setr_i32(int a, int b, int c, int d) { + return (__m128i)(v4i32){a,b,c,d}; +} + +XXL_INLINE __m128i lsx_setzero_m128i() { + return __lsx_vldi(0); +} + +XXL_INLINE __m128d lsx_set1_f64(double a) { + return (__m128d)(v2f64){a,a}; +} + +XXL_INLINE __m128d lsx_setzero_f64() { + return (__m128d)__lsx_vldi(0); +} + +XXL_INLINE __m128i lsx_packs_i16(__m128i a, __m128i b) { + return __lsx_vpickev_b(__lsx_vsat_h(b, 7), __lsx_vsat_h(a, 7)); +} + +XXL_INLINE __m128i lsx_packs_i32(__m128i a, __m128i b) { + return __lsx_vpickev_h(__lsx_vsat_w(b, 15), __lsx_vsat_w(a, 15)); +} + +XXL_INLINE __m128i lsx_packus_i16(__m128i a, __m128i b) { + return __lsx_vpickev_b(__lsx_vsat_hu(__lsx_vmax_h(b, __lsx_vldi(0)), 7), __lsx_vsat_hu(__lsx_vmax_h(a, __lsx_vldi(0)), 7)); +} + +XXL_INLINE __m128i lsx_shuffle_i32(__m128i a, int b) { + return __lsx_vshuf4i_w(a, (unsigned int)b); +} + +XXL_INLINE __m128i lsx_unpackhi_i8(__m128i a, __m128i b) { + return __lsx_vilvh_b(b, a); +} + +XXL_INLINE __m128i lsx_unpackhi_i16(__m128i a, __m128i b) { + return __lsx_vilvh_h(b, a); +} + +XXL_INLINE __m128i lsx_unpackhi_i32(__m128i a, __m128i b) { + return __lsx_vilvh_w(b, a); +} + +XXL_INLINE __m128i lsx_unpacklo_i8(__m128i a, __m128i b) { + return __lsx_vilvl_b(b, a); +} + +XXL_INLINE __m128i lsx_unpacklo_i16(__m128i a, __m128i b) { + return __lsx_vilvl_h(b, a); +} + +XXL_INLINE __m128i lsx_unpacklo_i32(__m128i a, __m128i b) { + return __lsx_vilvl_w(b, a); +} + +XXL_INLINE __m128d lsx_sqrt_f64(__m128d a) { + return __lsx_vfsqrt_d(a); +} + +XXL_INLINE __m128i lsx_castf32_m128i(__m128 a) { + return (__m128i)a; +} + +XXL_INLINE __m128 lsx_castm128i_f32(__m128i a) { + return (__m128)a; +} + +XXL_INLINE __m128d lsx_hadd_f64(__m128d a, __m128d b) { + return __lsx_vfadd_d((__m128d)__lsx_vpickev_d((__m128i)b, (__m128i)a), (__m128d)__lsx_vpickod_d((__m128i)b, (__m128i)a)); +} + +XXL_INLINE __m128 lsx_hadd_f32(__m128 a, __m128 b) { + return __lsx_vfadd_s((__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a), (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a)); +} + +XXL_INLINE __m128 lsx_blend_f32(__m128 a, __m128 b, const int c) { + return (__m128)__lsx_vbitsel_v((__m128i)a, (__m128i)b, __lsx_vilvl_h(__lsx_vldi((c&1)|(c&1)<<1|(c&2)<<1|(c&2)<<2|(c&4)<<2|(c&4)<<3|(c&8)<<3|(c&8)<<4|0xf900), __lsx_vldi((c&1)|(c&1)<<1|(c&2)<<1|(c&2)<<2|(c&4)<<2|(c&4)<<3|(c&8)<<3|(c&8)<<4|0xf900))); +} + +XXL_INLINE __m128 lsx_blendv_f32(__m128 a, __m128 b, __m128 c) { + return (__m128)__lsx_vbitsel_v((__m128i)a, (__m128i)b, __lsx_vslti_w((__m128i)c, 0)); +} + +XXL_INLINE int lsx_extract_i32(__m128i a, const int b) { + return (int)__lsx_vpickve2gr_wu(a, (unsigned int)b); +} + +XXL_INLINE __m128i lsx_mullo_i32(__m128i a, __m128i b) { + return __lsx_vmul_w(a, b); +} + +XXL_INLINE __m128i 
lsx_max_i32(__m128i a, __m128i b) { + return __lsx_vmax_w(a, b); +} + +XXL_INLINE __m128i lsx_min_i32(__m128i a, __m128i b) { + return __lsx_vmin_w(a, b); +} + +XXL_INLINE __m128i lsx_cvti8_i32(__m128i a) { + return __lsx_vsllwil_w_h(__lsx_vsllwil_h_b(a, 0), 0); +} + +XXL_INLINE __m128i lsx_shuffle_i8(__m128i a, __m128i b) { + return __lsx_vand_v(__lsx_vshuf_b(a, a, b), __lsx_vxori_b(__lsx_vslti_b(b, 0), 255)); +} + +XXL_INLINE __m128i lsx_hadd_i32(__m128i a, __m128i b) { + return __lsx_vadd_w(__lsx_vpickev_w(b, a), __lsx_vpickod_w(b, a)); +} + +XXL_INLINE __m128i lsx_maddubs_i16(__m128i a, __m128i b) { + return __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(a, b), __lsx_vmulwod_h_bu_b(a, b)); +} + +#endif // __loongarch_sx diff --git a/lite/core/context.h b/lite/core/context.h index dcbd7162b38..dec693ed99f 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -54,6 +54,7 @@ class Context; using HostContext = Context; using X86Context = Context; +using LoongArchContext = Context; using ARMContext = Context; using XPUContext = Context; using OpenCLContext = Context; @@ -361,6 +362,24 @@ class Context { }; #endif +#ifdef LITE_WITH_LOONGARCH +template <> +class Context { + public: + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + + void CopySharedTo(LoongArchContext* ctx) {} + + std::string name() const { return "LoongArchContext"; } + + private: + // overall information + // + // kernel information +}; +#endif + #ifdef LITE_WITH_OPENCL template <> class Context { @@ -465,6 +484,12 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_LOONGARCH + case TARGET(kLoongArch): + kernel_contexts_[TargetType::kLoongArch].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_XPU case TARGET(kXPU): kernel_contexts_[TargetType::kXPU].As().CopySharedTo( @@ -514,6 +539,9 @@ class ContextScheduler { #ifdef LITE_WITH_ARM InitContext(); #endif +#ifdef LITE_WITH_LOONGARCH + InitContext(); +#endif #ifdef LITE_WITH_OPENCL VLOG(4) << "ContextScheduler init opencl context "; InitContext(); diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 1a0f676e380..143639da01a 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -78,6 +78,10 @@ #endif #endif +#ifdef WITH_OMP +#include +#endif + #include #include diff --git a/lite/core/kernel.h b/lite/core/kernel.h index ee82a1f4d0a..7d9bf73d3bb 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -97,6 +97,9 @@ class KernelBase { #if defined(LITE_WITH_X86) WorkSpace::Global_X86().AllocReset(); #endif +#if defined(LITE_WITH_LOONGARCH) + WorkSpace::Global_LOONGARCH().AllocReset(); +#endif #if defined(LITE_WITH_METAL) WorkSpace::Global_METAL().AllocReset(); #endif diff --git a/lite/core/memory.cc b/lite/core/memory.cc index 1c5dd1b8095..c0186d93418 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -32,6 +32,7 @@ void* TargetMalloc(TargetType target, size_t size) { case TargetType::kHost: case TargetType::kX86: case TargetType::kARM: + case TargetType::kLoongArch: data = TargetWrapper::Malloc(size); break; #ifdef LITE_WITH_OPENCL @@ -65,6 +66,7 @@ void TargetFree(TargetType target, void* data, std::string free_flag) { case TargetType::kHost: case TargetType::kX86: case TargetType::kARM: + case TargetType::kLoongArch: TargetWrapper::Free(data); break; @@ -102,6 +104,7 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { case TargetType::kHost: case TargetType::kX86: case TargetType::kARM: + case TargetType::kLoongArch: 
TargetWrapper::MemcpySync( dst, src, size, IoDirection::DtoD); break; diff --git a/lite/core/memory.h b/lite/core/memory.h index 5741dd8bdcb..100aaae3446 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -68,6 +68,7 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { case TARGET(kX86): case TARGET(kHost): case TARGET(kARM): + case TARGET(kLoongArch): TargetWrapper::MemcpySync( dst, src, size, IoDirection::HtoH); break; diff --git a/lite/core/optimizer/mir/fusion/conv_conv_fuse_pass.cc b/lite/core/optimizer/mir/fusion/conv_conv_fuse_pass.cc index d24beeae99f..56177230a65 100644 --- a/lite/core/optimizer/mir/fusion/conv_conv_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/conv_conv_fuse_pass.cc @@ -32,7 +32,7 @@ void ConvConvFusePass::Apply(const std::unique_ptr& graph) { bool has_weight_quant = false; for (auto& place : graph->valid_places()) { if (place.target == TARGET(kARM) || place.target == TARGET(kHost) || - place.target == TARGET(kOpenCL) || place.target == TARGET(kX86)) { + place.target == TARGET(kOpenCL) || place.target == TARGET(kX86) || place.target == TARGET(kLoongArch)) { if (place.precision == PRECISION(kInt8)) { has_int8 = true; } @@ -78,4 +78,4 @@ void ConvConvFusePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass) - .BindTargets({TARGET(kARM), TARGET(kOpenCL), TARGET(kX86)}); + .BindTargets({TARGET(kARM), TARGET(kOpenCL), TARGET(kX86), TARGET(kLoongArch)}); diff --git a/lite/core/optimizer/mir/fusion/conv_scale_fuse_pass.cc b/lite/core/optimizer/mir/fusion/conv_scale_fuse_pass.cc index 8ec085ca833..fb2268c4f73 100644 --- a/lite/core/optimizer/mir/fusion/conv_scale_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/conv_scale_fuse_pass.cc @@ -32,7 +32,7 @@ void ConvScaleFusePass::Apply(const std::unique_ptr& graph) { bool has_weight_quant = false; for (auto& place : graph->valid_places()) { if (place.target == TARGET(kARM) || place.target == TARGET(kHost) || - place.target == TARGET(kX86)) { + place.target == TARGET(kX86) || place.target == TARGET(kLoongArch)) { if (place.precision == PRECISION(kInt8)) { has_int8 = true; } @@ -73,4 +73,4 @@ void ConvScaleFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_scale_fuse_pass, paddle::lite::mir::ConvScaleFusePass) - .BindTargets({TARGET(kOpenCL), TARGET(kARM), TARGET(kX86)}); + .BindTargets({TARGET(kOpenCL), TARGET(kARM), TARGET(kX86), TARGET(kLoongArch)}); diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index d456ee4ec91..e5c95d4a48f 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -33,6 +33,9 @@ #ifdef LITE_WITH_X86 #include "lite/backends/x86/fluid/float16.h" #endif +#ifdef LITE_WITH_LOONGARCH +#include "lite/backends/loongarch/fluid/float16.h" +#endif #ifdef LITE_WITH_OPENCL #include "lite/backends/opencl/cl_image_converter.h" diff --git a/lite/core/program.cc b/lite/core/program.cc index 7267a3ed010..8256841d370 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -556,6 +556,8 @@ RuntimeProgram::RuntimeProgram( kernels = op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); #elif defined(LITE_WITH_X86) kernels = op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); +#elif defined(LITE_WITH_LOONGARCH) + kernels = op->CreateKernels({Place{TARGET(kLoongArch)}, Place{TARGET(kHost)}}); #endif if (kernels.size() > 0) { kernel = 
std::move(kernels.front()); diff --git a/lite/core/scope.h b/lite/core/scope.h index 9d915ceeb5e..4cd0849786f 100644 --- a/lite/core/scope.h +++ b/lite/core/scope.h @@ -19,7 +19,11 @@ #include #include #include +#if defined(LITE_WITH_X86) #include "lite/backends/x86/fluid/rw_lock.h" +#elif defined(LITE_WITH_LOONGARCH) +#include "lite/backends/loongarch/fluid/rw_lock.h" +#endif #include "lite/core/variable.h" namespace paddle { diff --git a/lite/core/type_system.h b/lite/core/type_system.h index 2a7dc924e68..d5cd654d2cc 100644 --- a/lite/core/type_system.h +++ b/lite/core/type_system.h @@ -176,7 +176,7 @@ class Type : public DataType { // -------------------------------- compatible check --------------------------- static bool TargetCompatibleTo(const Type& a, const Type& b) { auto is_host = [](TargetType x) -> bool { - return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM) || + return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM) || x == TARGET(kLoongArch) || x == TARGET(kAny); }; if (a.IsTensor() || b.IsTensor() || a.IsTensorList() || b.IsTensorList()) { diff --git a/lite/core/workspace.h b/lite/core/workspace.h index 233c9f9055d..d3d8a39c900 100644 --- a/lite/core/workspace.h +++ b/lite/core/workspace.h @@ -63,6 +63,10 @@ class WorkSpace { static WorkSpace& Global_ARM() { return Global_Host(); } #endif +#if defined(LITE_WITH_LOONGARCH) + static WorkSpace& Global_LOONGARCH() { return Global_Host(); } +#endif + #if defined(LITE_WITH_METAL) static WorkSpace& Global_METAL() { static LITE_THREAD_LOCAL std::unique_ptr x( diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index cd3f674aed6..c13af083817 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -90,6 +90,7 @@ endif() add_subdirectory(host) add_subdirectory(arm) add_subdirectory(x86) +add_subdirectory(loongarch) add_subdirectory(opencl) add_subdirectory(xpu) add_subdirectory(metal) diff --git a/lite/kernels/host/while_compute.cc b/lite/kernels/host/while_compute.cc index 21ee8e8da99..2f6a7a94e8e 100644 --- a/lite/kernels/host/while_compute.cc +++ b/lite/kernels/host/while_compute.cc @@ -27,7 +27,7 @@ namespace host { bool GetCondData(const Tensor *cond) { auto is_host = [](TargetType x) -> bool { - return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); + return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM) || x == TARGET(kLoongArch); }; bool flag; diff --git a/lite/kernels/host/write_back_compute.cc b/lite/kernels/host/write_back_compute.cc index 4579075b2df..b534ca8f374 100644 --- a/lite/kernels/host/write_back_compute.cc +++ b/lite/kernels/host/write_back_compute.cc @@ -30,7 +30,7 @@ void WriteBackCompute::RunImplement(const lite::Tensor* x, auto x_target = x->target(); auto y_target = y->target(); auto is_host = [](TargetType x) -> bool { - return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); + return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM) || x == TARGET(kLoongArch); }; if (is_host(x_target) && is_host(y_target)) { diff --git a/lite/kernels/loongarch/CMakeLists.txt b/lite/kernels/loongarch/CMakeLists.txt new file mode 100755 index 00000000000..b078bfa9097 --- /dev/null +++ b/lite/kernels/loongarch/CMakeLists.txt @@ -0,0 +1,91 @@ +if(LITE_WITH_LOONGARCH AND NOT LITE_ON_MODEL_OPTIMIZE_TOOL) + set(IS_FAKED_KERNEL false CACHE INTERNAL "") + set(lite_kernel_deps ${lite_kernel_deps} loongarch_math CACHE INTERNAL "") +elseif(LITE_WITH_PYTHON OR LITE_ON_MODEL_OPTIMIZE_TOOL) + set(IS_FAKED_KERNEL true 
CACHE INTERNAL "") +else() + return() +endif() + + +add_kernel(activation_compute_loongarch loongarch basic SRCS activation_compute.cc) +add_kernel(scale_compute_loongarch loongarch basic SRCS scale_compute.cc) +#add_kernel(cast_compute_loongarch loongarch basic SRCS cast_compute.cc) +add_kernel(slice_compute_loongarch loongarch basic SRCS slice_compute.cc) +add_kernel(conv_depthwise_loongarch loongarch basic SRCS conv_depthwise.cc) +add_kernel(conv_compute_loongarch loongarch basic SRCS conv_compute.cc) +#add_kernel(conv_direct_loongarch loongarch basic SRCS conv_direct.cc) +add_kernel(instance_norm_compute_loongarch loongarch basic SRCS instance_norm_compute.cc) +add_kernel(group_norm_compute_loongarch loongarch basic SRCS group_norm_compute.cc) +add_kernel(calib_compute_loongarch loongarch basic SRCS calib_compute.cc) +add_kernel(pool_compute_loongarch loongarch basic SRCS pool_compute.cc) +add_kernel(stack_compute_loongarch loongarch basic SRCS stack_compute.cc) +add_kernel(dropout_compute_loongarch loongarch basic SRCS dropout_compute.cc) +add_kernel(transpose_compute_loongarch loongarch basic SRCS transpose_compute.cc) +add_kernel(layer_norm_compute_loongarch loongarch basic SRCS layer_norm_compute.cc) +add_kernel(fc_compute_loongarch loongarch basic SRCS fc_compute.cc) +add_kernel(gru_compute_loongarch loongarch basic SRCS gru_compute.cc) +add_kernel(gru_unit_compute_loongarch loongarch basic SRCS gru_unit_compute.cc) +add_kernel(sequence_expand_as_compute_loongarch loongarch basic SRCS sequence_expand_as_compute.cc) +add_kernel(sequence_conv_compute_loongarch loongarch basic SRCS sequence_conv_compute.cc) + +add_kernel(gather_compute_loongarch loongarch extra SRCS gather_compute.cc) +add_kernel(grid_sampler_compute_loongarch loongarch extra SRCS grid_sampler_compute.cc) +add_kernel(clip_compute_loongarch loongarch extra SRCS clip_compute.cc) +add_kernel(mul_compute_loongarch loongarch basic SRCS mul_compute.cc) +add_kernel(concat_compute_loongarch loongarch basic SRCS concat_compute.cc) +add_kernel(sequence_pool_compute_loongarch loongarch basic SRCS sequence_pool_compute.cc) +add_kernel(search_group_padding_compute_loongarch loongarch basic SRCS search_group_padding_compute.cc) +add_kernel(sequence_reverse_compute_loongarch loongarch basic SRCS sequence_reverse_compute.cc) +add_kernel(softmax_compute_loongarch loongarch basic SRCS softmax_compute.cc) +add_kernel(elementwise_compute_loongarch loongarch basic SRCS elementwise_compute.cc) +add_kernel(batch_norm_compute_loongarch loongarch basic SRCS batch_norm_compute.cc) +add_kernel(reduce_compute_loongarch loongarch basic SRCS reduce_compute.cc) +add_kernel(lookup_table_compute_loongarch loongarch basic SRCS lookup_table_compute.cc) +add_kernel(sequence_reshape_compute_loongarch loongarch basic SRCS sequence_reshape_compute.cc) +add_kernel(match_matrix_tensor_compute_loongarch loongarch basic SRCS match_matrix_tensor_compute.cc) +add_kernel(search_seq_depadding_compute_loongarch loongarch basic SRCS search_seq_depadding_compute.cc) +add_kernel(search_grnn_compute_loongarch loongarch basic SRCS search_grnn_compute.cc) +add_kernel(sequence_concat_compute_loongarch loongarch basic SRCS sequence_concat_compute.cc) +add_kernel(var_conv_2d_compute_loongarch loongarch basic SRCS var_conv_2d_compute.cc) +add_kernel(attention_padding_mask_compute_loongarch loongarch basic SRCS attention_padding_mask_compute.cc) +add_kernel(sequence_arithmetic_compute_loongarch loongarch basic SRCS sequence_arithmetic_compute.cc) + +# for content-dnn 
+add_kernel(search_aligned_mat_mul_compute_loongarch loongarch extra SRCS search_aligned_mat_mul_compute.cc)
+add_kernel(search_seq_fc_compute_loongarch loongarch extra SRCS search_seq_fc_compute.cc)
+add_kernel(sequence_topk_avg_pooling_compute_loongarch loongarch basic SRCS sequence_topk_avg_pooling_compute.cc)
+
+add_kernel(matmul_compute_loongarch loongarch basic SRCS matmul_compute.cc)
+add_kernel(matmul_v2_compute_loongarch loongarch basic SRCS matmul_v2_compute.cc)
+add_kernel(box_coder_compute_loongarch loongarch basic SRCS box_coder_compute.cc)
+add_kernel(density_prior_box_compute_loongarch loongarch basic SRCS density_prior_box_compute.cc)
+add_kernel(interpolate_compute_loongarch loongarch basic SRCS interpolate_compute.cc)
+add_kernel(pow_compute_loongarch loongarch extra SRCS pow_compute.cc)
+add_kernel(rnn_compute_loongarch loongarch basic SRCS rnn_compute.cc)
+add_kernel(conv_transpose_loongarch loongarch basic SRCS conv_transpose_compute.cc)
+add_kernel(set_value loongarch basic SRCS set_value_compute.cc)
+
+lite_cc_test(test_conv2d_compute_loongarch SRCS conv_compute_test.cc)
+lite_cc_test(test_mul_compute_loongarch SRCS mul_compute_test.cc)
+lite_cc_test(test_sequence_pool_compute_loongarch SRCS sequence_pool_compute_test.cc)
+lite_cc_test(test_batch_norm_compute_loongarch SRCS batch_norm_compute_test.cc)
+lite_cc_test(test_softmax_compute_loongarch SRCS softmax_compute_test.cc)
+lite_cc_test(test_sequence_expand_as_compute_loongarch SRCS sequence_expand_as_compute_test.cc)
+lite_cc_test(test_gru_compute_loongarch SRCS gru_compute_test.cc)
+lite_cc_test(test_matmul_compute_loongarch SRCS matmul_compute_test.cc)
+#lite_cc_test(test_cast_compute_loongarch SRCS cast_compute_test.cc)
+lite_cc_test(test_pool2d_compute_loongarch SRCS pool_compute_test.cc)
+lite_cc_test(test_layer_norm_compute_loongarch SRCS layer_norm_compute_test.cc)
+lite_cc_test(test_dropout_compute_loongarch SRCS dropout_compute_test.cc)
+lite_cc_test(test_transpose_compute_loongarch SRCS transpose_compute_test.cc)
+# lite_cc_test(test_search_fc_compute_loongarch SRCS search_fc_compute_test.cc)
+lite_cc_test(test_search_seq_depadding_compute_loongarch SRCS search_seq_depadding_compute_test.cc)
+lite_cc_test(test_search_grnn_compute_loongarch SRCS search_grnn_compute_test.cc)
+lite_cc_test(test_match_matrix_compute_loongarch SRCS match_matrix_tensor_compute_test.cc)
+lite_cc_test(test_lookup_table_compute_loongarch SRCS lookup_table_compute_test.cc)
+lite_cc_test(test_search_group_padding_compute_loongarch SRCS search_group_padding_compute_test.cc)
+lite_cc_test(test_sequence_concat_compute_loongarch SRCS sequence_concat_compute_test.cc)
+lite_cc_test(test_var_conv_2d_compute_loongarch SRCS var_conv_2d_compute_test.cc)
+#lite_cc_test(test_attention_padding_mask_compute_loongarch SRCS attention_padding_mask_compute_test.cc)
+lite_cc_test(test_sequence_arithmetic_compute_loongarch SRCS sequence_arithmetic_compute_test.cc)
diff --git a/lite/kernels/loongarch/activation_compute.cc b/lite/kernels/loongarch/activation_compute.cc
new file mode 100644
index 00000000000..cc46799d92d
--- /dev/null
+++ b/lite/kernels/loongarch/activation_compute.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/activation_compute.h" + +REGISTER_LITE_KERNEL(square, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SquareCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(relu, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(leaky_relu, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::LeakyReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindPaddleOpVersion("leaky_relu", 1) + .Finalize(); + +REGISTER_LITE_KERNEL(tanh, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::TanhCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gelu, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::GeluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(softsign, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SoftsignCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SigmoidCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(relu6, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::Relu6Compute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sqrt, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SqrtCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(rsqrt, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::RsqrtCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(mish, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::MishCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(hard_swish, + kLoongArch, + kFloat, + kNCHW, + 
paddle::lite::kernels::loongarch::HardSwishComputeCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(erf, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ErfCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); \ No newline at end of file diff --git a/lite/kernels/loongarch/activation_compute.h b/lite/kernels/loongarch/activation_compute.h new file mode 100644 index 00000000000..e83b3980f8b --- /dev/null +++ b/lite/kernels/loongarch/activation_compute.h @@ -0,0 +1,401 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif + +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/math/activation.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +enum ActBwdOpFwdDeps { + kNoDeps = 0x00, // Do not need any forward input/output + kDepX = 0x01, // Only need forward input X + kDepOut = 0x02, // Only need forward output Out + + // Never add kDepXOut, because Out can be always calculated + // by forward input X in backward part. + // FIXME(zjl): but in MKLDNN abs, X and Out are all needed... + // Developers should not rely on this enum value! + kDepXOut = 0x03 +}; + +template +struct BaseActivationFunctor { + using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } + + /* NOTE(*): Output reuse X memory if X is not dependented by its Gradient. + For example, sigmoid op's gradient didn't involve x, so its output can + reuse + input memory. But abs op's gradient use x, it can not be inplaced. + gradient did use x. 
+ */ + bool Inplace() const { return false; } +}; + +template +bool Activate(const lite::Tensor* X, lite::Tensor* Out) { + using T = typename Functor::ELEMENT_TYPE; + auto place = lite::fluid::EigenDeviceType(); + CHECK_OR_FALSE(X) + CHECK_OR_FALSE(Out) + auto x = lite::fluid::EigenVector::Flatten(*X); + auto out = lite::fluid::EigenVector::Flatten(*Out); + Functor()(place, x, out); + return true; +} + +// square(x) = x^2 +template +struct SquareFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.square(); + } +}; + +template +class SquareCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~SquareCompute() = default; +}; + +// relu(x) = max(x, 0) +template +struct ReluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)); + } +}; + +template +class ReluCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~ReluCompute() = default; +}; + +template +struct LeakyReluFunctor { + float alpha; + explicit LeakyReluFunctor(float alpha_) : alpha(alpha_) {} + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } +}; + +template +class LeakyReluCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data(); + auto X = param.X; + auto Out = param.Out; + auto place = lite::fluid::EigenDeviceType(); + CHECK(X); + CHECK(Out); + auto x = lite::fluid::EigenVector::Flatten(*X); + auto out = lite::fluid::EigenVector::Flatten(*Out); + LeakyReluFunctor functor(param.Leaky_relu_alpha); + functor(place, x, out); + } + + virtual ~LeakyReluCompute() = default; +}; + +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template +class TanhCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~TanhCompute() = default; +}; + +// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) +template +struct GeluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { +// Because the execute or device context can not be deliver here, it keep the +// marco for NVCC. 
+ auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } +}; + +template +class GeluCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~GeluCompute() = default; +}; + +// softsign(x) = x / (1 + |x|) +template +class SoftsignCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + // auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + const T* x_data = param.X->template data(); + T* out_data = param.Out->template mutable_data(); + size_t x_size = param.X->numel(); + for (size_t i = 0; i < x_size; i++) { + out_data[i] = x_data[i] / (static_cast(1) + std::abs(x_data[i])); + } + } + + virtual ~SoftsignCompute() = default; +}; + +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = (static_cast(1) + (-x).exp()).inverse(); + } +}; + +template +class SigmoidCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = this->Param(); + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~SigmoidCompute() = default; +}; + +// relu6(x) = min(max(0, x), 6) +template +struct Relu6Functor { + float threshold; + explicit Relu6Functor(float threshold_) : threshold(threshold_) {} + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(0)).cwiseMin(static_cast(threshold)); + } +}; + +template +class Relu6Compute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data(); + auto X = param.X; + auto Out = param.Out; + auto place = lite::fluid::EigenDeviceType(); + CHECK(X); + CHECK(Out); + auto x = lite::fluid::EigenVector::Flatten(*X); + auto out = lite::fluid::EigenVector::Flatten(*Out); + Relu6Functor functor(param.threshold); + functor(place, x, out); + } + + virtual ~Relu6Compute() = default; +}; + +template +struct SqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.sqrt(); + } +}; + +template +class SqrtCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~SqrtCompute() = default; +}; + +template +struct RsqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.rsqrt(); + } +}; + +template +class RsqrtCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~RsqrtCompute() = default; +}; + +template +class MishCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + param.Out->template mutable_data(); + auto x_dims = param.X->dims(); + auto x_data = param.X->template data(); + auto output_data = 
param.Out->template mutable_data(); + float threshold = param.threshold; + lite::loongarch::math::mish( + x_data, output_data, x_dims.production(), threshold); + } + + virtual ~MishCompute() = default; +}; + +template +class HardSwishComputeCompute + : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + param.Out->template mutable_data(); + auto x_dims = param.X->dims(); + auto x_data = param.X->template data(); + auto output_data = param.Out->template mutable_data(); + lite::loongarch::math::hard_swish(x_data, + output_data, + x_dims.production(), + param.hard_swish_scale, + param.hard_swish_offset, + param.hard_swish_threshold); + } + + virtual ~HardSwishComputeCompute() = default; +}; + +template +class ErfCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x_dims = param.X->dims(); + auto x_data = param.X->template data(); + auto output_data = param.Out->template mutable_data(); + for (int i = 0; i < x_dims.production(); ++i) { + output_data[0] = std::erf(x_data[0]); + x_data++; + output_data++; + } + } + + virtual ~ErfCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/attention_padding_mask_compute.cc b/lite/kernels/loongarch/attention_padding_mask_compute.cc new file mode 100644 index 00000000000..9d2d0988804 --- /dev/null +++ b/lite/kernels/loongarch/attention_padding_mask_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/attention_padding_mask_compute.h" + +REGISTER_LITE_KERNEL( + search_attention_padding_mask, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/attention_padding_mask_compute.h b/lite/kernels/loongarch/attention_padding_mask_compute.h new file mode 100644 index 00000000000..ec6e428f9c7 --- /dev/null +++ b/lite/kernels/loongarch/attention_padding_mask_compute.h @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/operators/attention_padding_mask_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* _pad_begin = param.pad_begin; + auto* top = param.Out; + int _pad_id = param.pad_id; + float _mask = param.mask; + auto src_len = static_cast(bottom1->lod()[0][1]); + const int att_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + int* pad_begin = _pad_begin->template mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const auto* src_data = bottom1->template data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = index + 1; + } + + const auto att_len = static_cast(bottom0->lod()[0][1]); + auto* top_data = top->template mutable_data(); + memcpy(top_data, + bottom0->template data(), + bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + top_data = + top->template mutable_data() + src_len * (att_len * i + j); + int src_idx = i % src_batch; + for (int k = pad_begin[src_idx]; k < src_len; ++k) { + top_data[k] = _mask; + } + } + } + } + + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/attention_padding_mask_compute_test.cc b/lite/kernels/loongarch/attention_padding_mask_compute_test.cc new file mode 100644 index 00000000000..e7593f0fc79 --- /dev/null +++ b/lite/kernels/loongarch/attention_padding_mask_compute_test.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/attention_padding_mask_compute.cc" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_loongarch, retrive_op) { + auto attention_padding_mask = + KernelRegistry::Global().Create("attention_padding_mask"); + ASSERT_FALSE(attention_padding_mask.empty()); + ASSERT_TRUE(attention_padding_mask.front()); +} + +TEST(attention_padding_mask_loongarch, init) { + AttentionPaddingMaskCompute attention_padding_mask; + ASSERT_EQ(attention_padding_mask.precision(), PRECISION(kFloat)); + ASSERT_EQ(attention_padding_mask.target(), TARGET(kLoongArch)); +} + +TEST(attention_padding_mask_loongarch, run_test) { + lite::Tensor x, y; + lite::Tensor out, pad_begin, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x, x_lod, get_max_len(y_lod)); + prepare_input(&y, y_lod, 1); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + + attention_padding_mask_ref(x, y, &out_ref, &pad_begin_ref, param); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_attention_padding_mask, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/batch_norm_compute.cc b/lite/kernels/loongarch/batch_norm_compute.cc new file mode 100644 index 
00000000000..df21130cc21 --- /dev/null +++ b/lite/kernels/loongarch/batch_norm_compute.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/batch_norm_compute.h" + +REGISTER_LITE_KERNEL(batch_norm, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::BatchNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sync_batch_norm, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::BatchNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/batch_norm_compute.h b/lite/kernels/loongarch/batch_norm_compute.h new file mode 100644 index 00000000000..4576445633d --- /dev/null +++ b/lite/kernels/loongarch/batch_norm_compute.h @@ -0,0 +1,167 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/operators/batch_norm_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +class BatchNormCompute : public KernelLite { + public: + using param_t = operators::BatchNormParam; + void Run() override { + // auto &context = ctx_->As(); + auto ¶m = *param_.get_mutable(); + param.is_test = true; + bool global_stats = param.is_test || param.use_global_stats; + + const auto *x = param.x; + const auto &x_dims = x->dims(); + CHECK(x_dims.size() >= 2 && x_dims.size() <= 5); + const int N = x_dims[0]; + const int C = param.data_layout == DATALAYOUT(kNCHW) + ? x_dims[1] + : x_dims[x_dims.size() - 1]; + const int sample_size = x->dims().production() / N / C; + + // alloc memory + param.y->template mutable_data(); + if (!param.is_test) { + param.mean_out->template mutable_data(); + param.variance_out->template mutable_data(); + param.saved_mean->template mutable_data(); + param.saved_variance->template mutable_data(); + } + if (!global_stats) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e( + param.saved_mean->template mutable_data(), C); + EigenVectorArrayMap saved_variance_e( + param.saved_variance->template mutable_data(), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + EigenVectorArrayMap running_mean_arr( + param.mean_out->template mutable_data(), C); + EigenVectorArrayMap running_var_arr( + param.variance_out->template mutable_data(), C); + + if ((N * sample_size) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + param.y->CopyDataFrom(*x); + return; + } + + switch (param.data_layout) { + case DATALAYOUT(kNCHW): { + ConstEigenArrayMap x_arr( + x->template data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + running_mean_arr = running_mean_arr * param.momentum + + saved_mean_e * (1. - param.momentum); + running_var_arr = running_var_arr * param.momentum + + saved_variance_e * (1. - param.momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (global_stats) { + ConstEigenVectorArrayMap var_arr(param.variance->template data(), + C); + inv_std = (var_arr + param.epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std( + param.saved_variance->template mutable_data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + + ConstEigenVectorArrayMap mean_arr( + global_stats ? 
param.mean->template data() + : param.saved_mean->template data(), + C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + + ConstEigenVectorArrayMap scale_arr(param.scale->template data(), C); + ConstEigenVectorArrayMap bias_arr(param.bias->template data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (param.data_layout) { + case DATALAYOUT(kNCHW): { + EigenArrayMap y_arr( + param.y->template mutable_data(), sample_size, N * C); + ConstEigenArrayMap x_arr(x->template data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + } + virtual ~BatchNormCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/batch_norm_compute_test.cc b/lite/kernels/loongarch/batch_norm_compute_test.cc new file mode 100644 index 00000000000..36a853b7b38 --- /dev/null +++ b/lite/kernels/loongarch/batch_norm_compute_test.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/batch_norm_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(batch_norm_loongarch, retrive_op) { + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); + ASSERT_FALSE(batch_norm.empty()); + ASSERT_TRUE(batch_norm.front()); +} + +TEST(batch_norm_loongarch, init) { + BatchNormCompute batch_norm; + ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(batch_norm.target(), TARGET(kLoongArch)); +} + +TEST(batch_norm_loongarch, run_test) { + lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out, + saved_mean, saved_variance; + constexpr int batch_size = 2; + std::vector x_shape{batch_size, 3, 64, 64}; + x.Resize(lite::DDim(x_shape)); + + std::vector scale_shape{3}; + scale.Resize(lite::DDim(scale_shape)); + + std::vector bias_shape{3}; + bias.Resize(lite::DDim(bias_shape)); + + std::vector mean_shape{3}; + mean.Resize(lite::DDim(mean_shape)); + + std::vector variance_shape{3}; + variance.Resize(lite::DDim(variance_shape)); + + std::vector y_shape{batch_size, 3, 64, 64}; + y.Resize(lite::DDim(y_shape)); + + std::vector mean_out_shape{3}; + mean_out.Resize(lite::DDim(mean_out_shape)); + + std::vector variance_out_shape{3}; + variance_out.Resize(lite::DDim(variance_out_shape)); + + std::vector saved_mean_shape{3}; + saved_mean.Resize(lite::DDim(saved_mean_shape)); + + std::vector saved_variance_shape{3}; + saved_variance.Resize(lite::DDim(saved_variance_shape)); + + auto x_data = x.mutable_data(); + auto scale_data = scale.mutable_data(); + auto bias_data = bias.mutable_data(); + auto mean_data = mean.mutable_data(); + auto variance_data = variance.mutable_data(); + y.mutable_data(); + mean_out.mutable_data(); + variance_out.mutable_data(); + saved_mean.mutable_data(); + saved_variance.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int i = 0; i < scale.dims().production(); i++) { + scale_data[i] = static_cast(i) * 0.01f + 0.03f; + } + for (int i = 0; i < bias.dims().production(); i++) { + bias_data[i] = static_cast(i) * 0.065f + 0.1f; + } + for (int i = 0; i < mean.dims().production(); i++) { + mean_data[i] = static_cast(i) * 0.0565f; + } + for (int i = 0; i < variance.dims().production(); i++) { + variance_data[i] = static_cast(i) * 2.08f + 1.5f; + } + // BatchNormCompute batch_norm; + BatchNormCompute batch_norm; + operators::BatchNormParam param; + + param.x = &x; + param.is_test = true; + param.scale = &scale; + param.bias = &bias; + param.mean = &mean; + param.variance = &variance; + param.use_global_stats = false; + param.epsilon = 1e-4f; + param.momentum = 0.9f; + param.y = &y; + param.mean_out = &mean_out; + param.variance_out = &variance_out; + param.saved_mean = &saved_mean; + param.saved_variance = &saved_variance; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + batch_norm.SetContext(std::move(ctx)); + batch_norm.SetParam(param); + batch_norm.Run(); + + LOG(INFO) << "output: " << y; + LOG(INFO) << "mean_out: " << mean_out; + LOG(INFO) << "variance_out: " << mean_out; + LOG(INFO) << "saved_mean: " << saved_mean; + LOG(INFO) << "saved_variance: " << saved_variance; + + /*for (int i = 0; i < y.dims().production(); i++) { + if(i < 5 || i > y.dims().production() - 5) + LOG(INFO) << y_data[i]; + }*/ +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + 
+USE_LITE_KERNEL(batch_norm, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/box_coder_compute.cc b/lite/kernels/loongarch/box_coder_compute.cc new file mode 100644 index 00000000000..6efba638f5e --- /dev/null +++ b/lite/kernels/loongarch/box_coder_compute.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/box_coder_compute.h" +#include +#include +#include "lite/backends/loongarch/math/box_coder.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void BoxCoderCompute::Run() { + auto& param = *param_.get_mutable(); + // required inputs + auto* prior_box = param.prior_box; // M x 4 => M x [xmin, ymin, xmax, ymax] + auto* target_box = param.target_box; // encode_center_size => N x 4; + // decode_center_size => N x M x 4 + // optional input + auto* prior_box_var = param.prior_box_var; // M x 4 or 4 + // output + auto* output_box = param.proposals; // N x M x 4 + // required attributes + std::string code_type = param.code_type; + bool normalized = param.box_normalized; + // optional attributes + std::vector variance = param.variance; + const int axis = param.axis; + + auto row = target_box->dims()[0]; // N + auto col = prior_box->dims()[0]; // M + if (code_type == "decode_center_size") { // same as target_box + col = target_box->dims()[1]; + } + auto len = prior_box->dims()[1]; // 4 + output_box->Resize({row, col, len}); // N x M x 4 + auto* output = output_box->mutable_data(); + + const float* target_box_data = target_box->data(); + const float* prior_box_data = prior_box->data(); + const float* prior_box_var_data = + prior_box_var ? 
prior_box_var->data() : nullptr; + + if (code_type == "encode_center_size") { + lite::loongarch::math::encode_center_size(row, + col, + len, + target_box_data, + prior_box_data, + prior_box_var_data, + normalized, + variance, + output); + } else if (code_type == "decode_center_size") { + int var_size = 0; + if (prior_box_var) { + var_size = 2; + } else if (!(variance.empty())) { + var_size = 1; + } + lite::loongarch::math::decode_center_size(axis, + var_size, + row, + col, + len, + target_box_data, + prior_box_data, + prior_box_var_data, + normalized, + variance, + output); + } else { + LOG(FATAL) << "box_coder don't support this code_type: " << code_type; + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(box_coder, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::BoxCoderCompute, + def) + .BindInput("PriorBox", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("PriorBoxVar", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("TargetBox", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("OutputBox", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/box_coder_compute.h b/lite/kernels/loongarch/box_coder_compute.h new file mode 100644 index 00000000000..557b91c46fc --- /dev/null +++ b/lite/kernels/loongarch/box_coder_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +class BoxCoderCompute : public KernelLite { + public: + using param_t = operators::BoxCoderParam; + + void Run() override; + + virtual ~BoxCoderCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/calib_compute.cc b/lite/kernels/loongarch/calib_compute.cc new file mode 100644 index 00000000000..2b6a5af8f38 --- /dev/null +++ b/lite/kernels/loongarch/calib_compute.cc @@ -0,0 +1,320 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/loongarch/calib_compute.h" + +#include +#include "lite/backends/loongarch/fluid/float16.h" +#include "lite/backends/loongarch/math/calib.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +void CalibComputeFp32ToInt8::Run() { + auto& param = this->template Param(); + std::vector scale = {param.scale}; + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + lite::loongarch::math::fp32_to_int8( + din, dout, scale.data(), 1, 1, param.input->numel()); +} + +template +void CalibComputeFp32ToFp16::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = lite::fluid::float16(din[i]).x; + } +} + +template +void CalibComputeFp16ToFp32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = static_cast(lite::fluid::raw_uint16_to_float16(din[i])); + } +} + +template +void CalibComputeInt64ToInt32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = static_cast(din[i]); + } +} + +template +void CalibComputeInt32ToInt64::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = static_cast(din[i]); + } +} + +template +void CalibComputeInt8ToFp32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + std::vector scale = {param.scale}; + auto* dout = param.output->template mutable_data(); + lite::loongarch::math::int8_to_fp32( + din, dout, scale.data(), 1, 1, param.input->numel()); +} + +template +void CalibComputeInt32ToFp32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = static_cast(din[i]); + } +} + +template +void CalibComputeFp32ToInt32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = static_cast(din[i]); + } +} + +template +void CalibComputeInt32ToFp16::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = lite::fluid::float16(din[i]).x; + } +} + +template +void CalibComputeInt64ToFp16::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = lite::fluid::float16(din[i]).x; + } +} + +template +void CalibComputeFp16ToInt32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i 
= 0; i < param.input->numel(); ++i) { + dout[i] = static_cast(lite::fluid::raw_uint16_to_float16(din[i])); + } +} + +template +void CalibComputeInt64ToFp32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = static_cast(din[i]); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::loongarch::CalibComputeFp32ToInt8 + i8_fp32_to_int8; +REGISTER_LITE_KERNEL(calib, kLoongArch, kInt8, kNCHW, i8_fp32_to_int8, fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeInt8ToFp32 + i8_int8_to_fp32; +REGISTER_LITE_KERNEL(calib, kLoongArch, kInt8, kNCHW, i8_int8_to_fp32, int8_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeInt32ToFp32 + fp_int32_to_fp32; +REGISTER_LITE_KERNEL( + calib, kLoongArch, kFloat, kNCHW, fp_int32_to_fp32, int32_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeFp32ToInt32 + fp_fp32_to_int32; +REGISTER_LITE_KERNEL( + calib, kLoongArch, kFloat, kNCHW, fp_fp32_to_int32, fp32_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeInt32ToInt64 + fp_int32_to_int64; +REGISTER_LITE_KERNEL( + calib, kLoongArch, kFloat, kNCHW, fp_int32_to_int64, int32_to_int64) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeInt64ToInt32 + fp_int64_to_int32; +REGISTER_LITE_KERNEL( + calib, kLoongArch, kFloat, kNCHW, fp_int64_to_int32, int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kLoongArch), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeInt64ToFp32 + fp_int64_to_fp32; +REGISTER_LITE_KERNEL( + calib, kLoongArch, kFloat, kNCHW, fp_int64_to_fp32, int64_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeFp32ToFp16 + fp_fp32_to_fp16; +REGISTER_LITE_KERNEL(calib, kLoongArch, kFloat, kNCHW, fp_fp32_to_fp16, fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFP16))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeFp16ToFp32 + 
fp16_fp16_to_fp32; +REGISTER_LITE_KERNEL(calib, kLoongArch, kFP16, kNCHW, fp16_fp16_to_fp32, fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kLoongArch, kInt8, kNCHW, i8_fp32_to_int8, fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kLoongArch, kInt8, kNCHW, i8_int8_to_fp32, int8_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kLoongArch, kFloat, kNCHW, fp_int32_to_fp32, int32_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kLoongArch, kFloat, kNCHW, fp_fp32_to_int32, fp32_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kLoongArch, kFloat, kNCHW, fp_int32_to_int64, int32_to_int64) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kLoongArch, kFloat, kNCHW, fp_int64_to_int32, int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kLoongArch), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kLoongArch, kFloat, kNCHW, fp_int64_to_fp32, int64_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +typedef paddle::lite::kernels::loongarch::CalibComputeInt32ToFp16 + fp_int32_to_fp16; +REGISTER_LITE_KERNEL( + calib, kLoongArch, kFloat, kNCHW, fp_int32_to_fp16, int32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFP16))}) + .Finalize(); +typedef paddle::lite::kernels::loongarch::CalibComputeFp16ToInt32 + fp_fp16_to_int32; +REGISTER_LITE_KERNEL( + calib, kLoongArch, kFloat, kNCHW, fp_fp16_to_int32, fp16_to_int32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); +typedef paddle::lite::kernels::loongarch::CalibComputeInt64ToFp16 + fp_int64_to_fp16; +REGISTER_LITE_KERNEL( + calib, kLoongArch, kFloat, kNCHW, fp_int64_to_fp16, int64_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/loongarch/calib_compute.h b/lite/kernels/loongarch/calib_compute.h new file mode 100644 
index 00000000000..49a72c77859 --- /dev/null +++ b/lite/kernels/loongarch/calib_compute.h @@ -0,0 +1,174 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" +#include "lite/operators/calib_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +typedef uint16_t float16; +template +class CalibComputeFp32ToInt8 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFp32ToInt8() override{}; + + private: +}; + +template +class CalibComputeFp32ToFp16 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFp32ToFp16() override{}; + + private: +}; + +template +class CalibComputeFp16ToFp32 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFp16ToFp32() override{}; + + private: +}; + +template +class CalibComputeInt64ToInt32 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt64ToInt32() override{}; + + private: +}; + +template +class CalibComputeInt8ToFp32 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt8ToFp32() override{}; + + private: +}; + +template +class CalibComputeInt32ToFp32 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt32ToFp32() override{}; + + private: +}; + +template +class CalibComputeInt32ToInt64 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt32ToInt64() override{}; + + private: +}; + +template +class CalibComputeFp32ToInt32 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFp32ToInt32() override{}; + + private: +}; + +template +class CalibComputeInt64ToFp32 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt64ToFp32() override{}; + + private: +}; + +template +class CalibComputeInt32ToFp16 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt32ToFp16() override{}; + + private: +}; + +template +class CalibComputeInt64ToFp16 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt64ToFp16() override{}; + + private: +}; + +template +class CalibComputeFp16ToInt32 : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFp16ToInt32() override{}; + + private: +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/cast_compute.cc b/lite/kernels/loongarch/cast_compute.cc new file mode 
100644 index 00000000000..b4581a6e940 --- /dev/null +++ b/lite/kernels/loongarch/cast_compute.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/cast_compute.h" + +REGISTER_LITE_KERNEL(cast, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::CastCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + cast, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::CastCompute<::paddle::lite::fluid::float16>, + fp16_to_any) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(cast, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::CastCompute, + bool_to_any) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kBool))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(cast, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::CastCompute, + int32_to_any) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/loongarch/cast_compute.h b/lite/kernels/loongarch/cast_compute.h new file mode 100644 index 00000000000..565d680738c --- /dev/null +++ b/lite/kernels/loongarch/cast_compute.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "lite/backends/loongarch/fluid/data_type.h" +#include "lite/backends/loongarch/fluid/hostdevice.h" +#include "lite/backends/loongarch/fluid/transform.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +class CastOpFunctor { + public: + CastOpFunctor(const lite::Tensor* in, + lite::Tensor* out, + const lite::Context& context) + : input(in), output(out), ctx(context) {} + + template + void apply() const { + auto* in_begin = input->data(); + auto numel = input->dims().production(); + auto* in_end = in_begin + numel; + auto* out_begin = output->mutable_data(); + paddle::lite::fluid::Transform trans; + trans( + ctx, in_begin, in_end, out_begin, CastOpTransformFunctor()); + } + + private: + const lite::Tensor* input; + lite::Tensor* output; + const lite::Context& ctx; +}; + +template +class CastCompute : public KernelLite { + public: + using param_t = operators::CastParam; + + void Run() override { + auto param = param_.get_mutable(); + auto& context = ctx_->As(); + auto x = param->X; + auto out = param->Out; + auto out_dtype = param->out_dtype; + paddle::lite::fluid::VisitDataType( + static_cast(out_dtype), + CastOpFunctor(x, out, context)); + } + virtual ~CastCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/cast_compute_test.cc b/lite/kernels/loongarch/cast_compute_test.cc new file mode 100644 index 00000000000..544911aaf8d --- /dev/null +++ b/lite/kernels/loongarch/cast_compute_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
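CastCompute above dispatches on param->out_dtype through VisitDataType and applies a per-element static_cast via CastOpTransformFunctor. The sketch below shows that pattern in isolation; the DType enum and the helper names (CastRange, Cast) are illustrative stand-ins, not Paddle's framework data-type machinery.

#include <cstdint>
#include <iostream>
#include <vector>

enum class DType { kFloat32, kInt32, kInt64 };  // illustrative enum

// Per-element cast, mirroring CastOpTransformFunctor's static_cast.
template <typename InT, typename OutT>
void CastRange(const InT* in, OutT* out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) out[i] = static_cast<OutT>(in[i]);
}

// Dispatch on the requested output dtype, as VisitDataType does.
template <typename InT>
void Cast(const std::vector<InT>& in, DType out_dtype, void* out) {
  switch (out_dtype) {
    case DType::kFloat32:
      CastRange(in.data(), static_cast<float*>(out), in.size());
      break;
    case DType::kInt32:
      CastRange(in.data(), static_cast<int32_t*>(out), in.size());
      break;
    case DType::kInt64:
      CastRange(in.data(), static_cast<int64_t*>(out), in.size());
      break;
  }
}

int main() {
  std::vector<float> x = {1.7f, -2.3f, 3.0f};
  std::vector<int32_t> y(x.size());
  Cast(x, DType::kInt32, y.data());
  for (int32_t v : y) std::cout << v << " ";  // prints: 1 -2 3
  std::cout << "\n";
  return 0;
}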
+ +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/cast_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(cast_loongarch, retrive_op) { + auto cast = KernelRegistry::Global().Create("cast"); + ASSERT_FALSE(cast.empty()); + ASSERT_TRUE(cast.front()); +} + +TEST(cast_loongarch, init) { + CastCompute cast; + ASSERT_EQ(cast.precision(), PRECISION(kFloat)); + ASSERT_EQ(cast.target(), TARGET(kLoongArch)); +} + +TEST(cast_loongarch, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 1, 3, 3}; + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape{batch_size, 1, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(1); + } + + CastCompute cast; + operators::CastParam param; + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + cast.SetContext(std::move(ctx)); + cast.SetParam(param); + cast.Run(); + + std::vector ref_results = {1, 1, 1, 1, 1, 1, 1, 1, 1}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(cast, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/clip_compute.cc b/lite/kernels/loongarch/clip_compute.cc new file mode 100644 index 00000000000..52c9692fe3c --- /dev/null +++ b/lite/kernels/loongarch/clip_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/clip_compute.h" + +REGISTER_LITE_KERNEL(clip, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ClipCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Min", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Max", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindPaddleOpVersion("clip", 1) + .Finalize(); diff --git a/lite/kernels/loongarch/clip_compute.h b/lite/kernels/loongarch/clip_compute.h new file mode 100644 index 00000000000..0f1dbf4feaa --- /dev/null +++ b/lite/kernels/loongarch/clip_compute.h @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/backends/loongarch/math/clip.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/operators/clip_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class ClipCompute : public KernelLite { + public: + using param_t = operators::ClipParam; + + void Run() override { + auto& param = Param(); + lite::Tensor* x = param.x; + lite::Tensor* min_tensor = param.min_tensor; + lite::Tensor* max_tensor = param.max_tensor; + lite::Tensor* out = param.out; + T min = param.min; + T max = param.max; + + if (min_tensor != nullptr) { + min = min_tensor->data()[0]; + } + if (max_tensor != nullptr) { + max = max_tensor->data()[0]; + } + + const T* x_ptr = x->data(); + T* out_ptr = out->mutable_data(); + int64_t num = x->numel(); + + lite::loongarch::math::clip(x_ptr, out_ptr, num, max, min); + } + + virtual ~ClipCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/concat_compute.cc b/lite/kernels/loongarch/concat_compute.cc new file mode 100644 index 00000000000..f0064b2ad39 --- /dev/null +++ b/lite/kernels/loongarch/concat_compute.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
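ClipCompute above takes min/max either from the op attributes or from optional one-element Min/Max tensors (the tensors win when present) and clamps every element. A standalone sketch of that behavior follows; lite::loongarch::math::clip itself is not part of this hunk, so the clamp loop here is an assumption.

#include <algorithm>
#include <vector>

// Clamp each element to [min_v, max_v]; the optional pointers mimic the
// Min/Max input tensors, which override the attribute values.
void Clip(const float* x, float* out, int64_t n, float min_v, float max_v,
          const float* min_tensor = nullptr,
          const float* max_tensor = nullptr) {
  if (min_tensor) min_v = min_tensor[0];
  if (max_tensor) max_v = max_tensor[0];
  for (int64_t i = 0; i < n; ++i) {
    out[i] = std::min(std::max(x[i], min_v), max_v);
  }
}

int main() {
  std::vector<float> x = {-2.f, 0.5f, 3.f};
  std::vector<float> y(x.size());
  Clip(x.data(), y.data(), x.size(), -1.f, 1.f);  // -> {-1, 0.5, 1}
  return 0;
}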
+ +#include "lite/kernels/loongarch/concat_compute.h" + +REGISTER_LITE_KERNEL(concat, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ConcatCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ConcatCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/loongarch/concat_compute.h b/lite/kernels/loongarch/concat_compute.h new file mode 100644 index 00000000000..0b1c40f71b8 --- /dev/null +++ b/lite/kernels/loongarch/concat_compute.h @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +inline int count(int start_axis, int end_axis, const lite::DDim& dim) { + int count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= dim[i]; + } + return count; +} + +template +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + void Run() override { + auto& param = *param_.get_mutable(); + if (param.x.size() == 1) { + param.output->ShareDataWith(*param.x[0]); + return; + } + + int axis = param.axis; + auto* axis_tensor = param.axis_tensor; + if (axis_tensor != nullptr) { + auto* axis_tensor_data = axis_tensor->template data(); + axis = axis_tensor_data[0]; + } + const auto& x_dims = param.x[0]->dims(); + if (axis < 0) { + axis += static_cast(x_dims.size()); + } + + auto* out = param.output; + T* output_data = param.output->template mutable_data(); + + int offset_concat_axis = 0; + int num_concat = count(0, axis, x_dims); + int concat_input_size = count(axis + 1, x_dims.size(), x_dims); + const int top_concat_axis = out->dims()[axis]; + for (size_t i = 0; i < param.x.size(); ++i) { + const T* bottom_data = param.x[i]->template data(); + const int64_t bottom_concat_axis = param.x[i]->dims()[axis]; + for (int n = 0; n < num_concat; ++n) { + std::memcpy( + output_data + + (n * top_concat_axis + offset_concat_axis) * concat_input_size, + bottom_data + n * bottom_concat_axis * concat_input_size, + (bottom_concat_axis * concat_input_size) * sizeof(T)); + } + offset_concat_axis += bottom_concat_axis; + } + } + virtual ~ConcatCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/conv_compute.cc b/lite/kernels/loongarch/conv_compute.cc new file mode 100644 index 00000000000..c10e1cf35b4 --- /dev/null +++ b/lite/kernels/loongarch/conv_compute.cc @@ -0,0 +1,552 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
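The memcpy loop in ConcatCompute above decomposes an NCHW concat along `axis` into count(0, axis) outer slices, each receiving bottom_concat_axis * concat_input_size contiguous elements per input, offset by the running position along the concat axis. The following standalone sketch reproduces that index arithmetic for flat buffers; the Concat helper is illustrative, not the kernel itself.

#include <cstring>
#include <vector>

// Concatenate along `axis` using the same outer/inner decomposition as
// ConcatCompute: outer = product of dims before axis, inner = product after.
std::vector<float> Concat(const std::vector<std::vector<float>>& inputs,
                          const std::vector<std::vector<int>>& dims, int axis) {
  int outer = 1, inner = 1;
  for (int i = 0; i < axis; ++i) outer *= dims[0][i];
  for (int i = axis + 1; i < static_cast<int>(dims[0].size()); ++i)
    inner *= dims[0][i];
  int out_axis = 0;
  for (const auto& d : dims) out_axis += d[axis];

  std::vector<float> out(static_cast<size_t>(outer) * out_axis * inner);
  int offset = 0;  // running offset along the concat axis
  for (size_t i = 0; i < inputs.size(); ++i) {
    const int in_axis = dims[i][axis];
    for (int n = 0; n < outer; ++n) {
      std::memcpy(out.data() + (n * out_axis + offset) * inner,
                  inputs[i].data() + n * in_axis * inner,
                  sizeof(float) * in_axis * inner);
    }
    offset += in_axis;
  }
  return out;
}

int main() {
  // Two [1, 2, 2] tensors concatenated along axis 1 -> one [1, 4, 2] tensor.
  std::vector<float> a = {1, 2, 3, 4}, b = {5, 6, 7, 8};
  auto out = Concat({a, b}, {{1, 2, 2}, {1, 2, 2}}, 1);
  return out.size() == 8 ? 0 : 1;  // out = {1,2,3,4,5,6,7,8}
}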
+ +#include "lite/kernels/loongarch/conv_compute.h" +#include +#include "lite/backends/loongarch/math/fill_bias_activate.h" +#include "lite/kernels/loongarch/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { +#define INIT_PARAM \ + auto& param = this->Param(); \ + auto x_dims = param.x->dims(); \ + auto w_dims = param.filter->dims(); \ + auto o_dims = param.output->dims(); \ + int win = x_dims[3]; \ + int hin = x_dims[2]; \ + int chin = x_dims[1]; \ + int num = x_dims[0]; \ + int wout = o_dims[3]; \ + int hout = o_dims[2]; \ + int chout = o_dims[1]; \ + int kw = w_dims[3]; \ + int kh = w_dims[2]; \ + int group = param.groups; \ + int m = chout / group; \ + int n = hout * wout; \ + int k = chin * kw * kh / group; + +#define PREPARE_PARAM \ + auto& param = this->Param(); \ + const int input_channel = param.x->dims()[1]; \ + const int output_channel = param.filter->dims()[0]; \ + const int groups = param.groups; \ + const int kernel_h = param.filter->dims()[2]; \ + const int kernel_w = param.filter->dims()[3]; \ + const int stride_h = param.strides[0]; \ + const int stride_w = param.strides[1]; \ + auto paddings = *param.paddings; \ + auto dilations = *param.dilations; \ + bool dw_kernel = (input_channel == groups && output_channel == groups); \ + bool ks_equal = (stride_h == stride_w) && (kernel_h == kernel_w); \ + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); \ + bool kps_equal = (paddings[0] == paddings[2]) && ks_equal; \ + bool pads_equal = \ + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); \ + bool flag_dw_3x3 = \ + (kernel_h == 3) && (kernel_w == 3) && (stride_h == 1 || stride_h == 2); \ + bool flag_dw_5x5 = \ + (kernel_h == 5) && (kernel_w == 5) && (stride_h == 1 || stride_h == 2); + +#define PREPARE_PARAM_INT8 \ + auto& param = this->Param(); \ + const int input_channel = param.x->dims()[1]; \ + const int output_channel = param.filter->dims()[0]; \ + const int groups = param.groups; \ + const int kernel_h = param.filter->dims()[2]; \ + const int kernel_w = param.filter->dims()[3]; \ + const int stride_h = param.strides[0]; \ + const int stride_w = param.strides[1]; \ + auto paddings = *param.paddings; \ + auto dilations = *param.dilations; \ + bool ks_equal = (stride_h == stride_w) && (kernel_h == kernel_w); \ + bool kps_equal = (paddings[0] == paddings[2]) && ks_equal; \ + bool pads_equal = \ + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + +template <> +void Conv2dCompute::PrepareForRun() { + PREPARE_PARAM + //! todo add conv_5x5_depthwise implement + bool flag_dw = flag_dw_3x3 || flag_dw_5x5; + if (kernel_w == 1 && stride_w == 1 && paddings[0] == 0 && kps_equal && + pads_equal) { + flag_1x1gemm_ = true; + } else { + flag_1x1gemm_ = false; + } + + bool nodilations = true; + for (auto ele : *(param.dilations)) + if (ele != 1) nodilations = false; + + bool pad_all_equal = (paddings[0] == paddings[1]) && + (paddings[1] == paddings[2]) && + (paddings[2] == paddings[3]); + bool flag_p = paddings[0] <= stride_h; + + //! 
select conv impl + if (dw_kernel && kps_equal && flag_dw && pads_equal && + ((flag_dw_5x5 && no_dilation) || (flag_dw_3x3 && (groups & 3) == 0))) { + impl_ = new DepthwiseConv; + VLOG(3) << "invoking conv_depthwise_3x3p0p1 or conv_depthwise_5x5"; + } + + // support 3x3s1p01,5x5s1p01,7x7s1p01 + // 3x3s2p012,5x5s1p012,7x7s1p012 + if (output_channel % 8 == 0 && groups == 1 && + (kernel_h == 3 || kernel_h == 5 || kernel_h == 7) && + (stride_h == 2 || stride_h == 1) && nodilations && kps_equal && + pad_all_equal && flag_p) { +#if defined(_WIN64) || defined(__MINGW64__) || \ + (defined(__CYGWIN__) && defined(__loongarch_64__)) || defined(__loongarch_64__) + impl_ = new DirectConv(); + VLOG(3) << "invoking directConv"; +#endif + } + + if (impl_) { + impl_->SetContext(std::move(this->ctx_)); + impl_->SetParam(param); + impl_->PrepareForRun(); + is_first_epoch_ = false; + } +} + +template <> +void Conv2dCompute::Run() { + if (impl_) { + return impl_->Run(); + } + auto& ctx = ctx_->As(); + INIT_PARAM + bool flag_bias = (param.bias != nullptr); + unsigned int group_size_out = m * n; + unsigned int group_size_weights = m * k; + unsigned int group_size_coldata = n * k; + unsigned int channel_in_size = chin * hin * win; + unsigned int channel_out_size = chout * hout * wout; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + + auto din = param.x->data(); + auto dout = param.output->mutable_data(); + auto weights = param.filter->data(); + const float* bias_ptr = + flag_bias ? static_cast(param.bias->data()) + : nullptr; + float* col_data = nullptr; + + if (!flag_1x1gemm_) { + size_t col_size = group_size_coldata * group; + size_t col_data_size = static_cast(col_size * sizeof(float)); + col_data = static_cast(TargetMalloc(TARGET(kLoongArch), col_data_size)); + } + auto act_param = param.activation_param; + paddle::lite::loongarch::math::Blas matmul(ctx); + for (int i = 0; i < num; i++) { + const float* din_batch = din + i * channel_in_size; + float* dout_batch = dout + i * channel_out_size; + const float* din_data = din_batch; + if (!flag_1x1gemm_) { + lite::loongarch::math::im2col(din_batch, + chin, + hin, + win, + w_dims[2], + w_dims[3], + paddings[0], + paddings[1], + paddings[2], + paddings[3], + param.strides[0], + param.strides[1], + dilations[0], + dilations[1], + col_data); + din_data = static_cast(col_data); + } + + for (int g = 0; g < group; g++) { + const float* col_data_group = din_data + g * group_size_coldata; + const float* weights_group = weights + g * group_size_weights; + float* dout_group = dout_batch + g * group_size_out; + if (n == 1) { + matmul.GEMV( + false, m, k, 1.f, weights_group, col_data_group, 0.f, dout_group); + } else { + matmul.GEMM(false, + false, + m, + n, + k, + 1.f, + weights_group, + k, + col_data_group, + n, + 0.f, + dout_group, + n); + } + } + //! 
bias and activate + lite::loongarch::math::fill_bias_act( + dout_batch, bias_ptr, chout, wout * hout, flag_bias, &act_param); + } + if (!flag_1x1gemm_) TargetFree(TARGET(kLoongArch), col_data); +} + +template <> +void Conv2dCompute::PrepareForRun() { + PREPARE_PARAM_INT8 + if (kernel_w == 1 && stride_w == 1 && paddings[0] == 0 && kps_equal && + pads_equal) { + flag_1x1gemm_ = true; + } else { + flag_1x1gemm_ = false; + } + + auto o_dims = param.output->dims(); + int m = output_channel / groups; + int n = o_dims[2] * o_dims[3]; + int k = input_channel * kernel_h * kernel_w / groups; + int group_size_weights = m * k; + auto weights = param.filter->data(); + bool flag_bias = (param.bias != nullptr); + const float* bias_ptr = + flag_bias ? static_cast(param.bias->data()) + : nullptr; + auto w_scale_ = param.weight_scale; + + Tensor weight_s{}; + weight_s.Resize({param.filter->dims()[0]}); + weight_s.set_precision(PRECISION(kFloat)); + auto weight_tmp = weight_s.mutable_data(); + + if (w_scale_.size() != 1 && w_scale_.size() != param.filter->dims()[0]) { + LOG(FATAL) << "weights scale size must equal to filter size"; + } + if (w_scale_.size() == 1) { + for (int i = 0; i < param.filter->dims()[0]; ++i) { + weight_tmp[i] = (w_scale_[0]); + } + } else { + for (int i = 0; i < param.filter->dims()[0]; ++i) { + weight_tmp[i] = (w_scale_[i]); + } + } + auto weight_scale = weight_s.data(); + const float input_scale = param.input_scale; + const float output_scale = param.output_scale; + int relu_type = 0; + float relu_alpha = 1.f; + + if (param.activation_param.active_type == lite_api::ActivationType::kRelu6) { + relu_type = 2; + relu_alpha = param.activation_param.Relu_clipped_coef; + } else if (param.activation_param.active_type == + lite_api::ActivationType::kLeakyRelu) { + relu_type = 3; + relu_alpha = param.activation_param.Leaky_relu_alpha; + } else if (param.activation_param.active_type == + lite_api::ActivationType::kRelu) { + relu_type = 1; + } + for (int g = 0; g < groups; g++) { + const int8_t* weights_group = weights + g * group_size_weights; + auto gemm = new lite::loongarch::math::generate_gemm_s8u8_loongarch_kern( + false, + false, + m, + n, + k, + weights_group, + n, + weight_scale + g * m, + input_scale, + output_scale, + bias_ptr + g * m, + relu_type, + relu_alpha); + gemm_s8_ptr_float_.push_back(gemm); + } +} + +template <> +void Conv2dCompute::Run() { + INIT_PARAM + int group_size_coldata = n * k; + int channel_size_in = hin * win; + int channel_size_out = hout * wout; + int chin_per_group = chin / group; + int group_size_weights = m * k; + int8_t* col_data = nullptr; + auto din = param.x->data(); + auto dout = param.output->mutable_data(); + auto weights = param.filter->data(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + + if (!flag_1x1gemm_) { + int col_size = group * group_size_coldata; + col_data = static_cast( + TargetMalloc(TARGET(kLoongArch), col_size * sizeof(int8_t))); + } + for (int b = 0; b < num; ++b) { + for (int g = 0; g < group; ++g) { + float* dout_group = dout + (b * chout + g * m) * channel_size_out; + const int8_t* din_group = + din + (b * chin + g * chin_per_group) * channel_size_in; + const int8_t* weights_group = weights + g * group_size_weights; + + if (!flag_1x1gemm_) { + lite::loongarch::math::im2col(din_group, + chin_per_group, + hin, + win, + kh, + kw, + paddings[0], + paddings[1], + paddings[2], + paddings[3], + param.strides[0], + param.strides[1], + dilations[0], + dilations[1], + col_data); + 
gemm_s8_ptr_float_[g]->compute(weights_group, col_data, dout_group); + } else { + gemm_s8_ptr_float_[g]->compute(weights_group, din_group, dout_group); + } + } + } + if (!flag_1x1gemm_) TargetFree(TARGET(kLoongArch), col_data); +} + +template <> +void Conv2dCompute::PrepareForRun() { + PREPARE_PARAM_INT8 + if (kernel_w == 1 && stride_w == 1 && paddings[0] == 0 && kps_equal && + pads_equal) { + flag_1x1gemm_ = true; + } else { + flag_1x1gemm_ = false; + } + + auto o_dims = param.output->dims(); + int m = output_channel / groups; + int n = o_dims[2] * o_dims[3]; + int k = input_channel * kernel_h * kernel_w / groups; + int group_size_weights = m * k; + auto weights = param.filter->data(); + bool flag_bias = (param.bias != nullptr); + const float* bias_ptr = + flag_bias ? static_cast(param.bias->data()) + : nullptr; + auto w_scale_ = param.weight_scale; + Tensor weight_s{}; + weight_s.Resize({param.filter->dims()[0]}); + weight_s.set_precision(PRECISION(kFloat)); + auto weight_tmp = weight_s.mutable_data(); + + if (w_scale_.size() != 1 && w_scale_.size() != param.filter->dims()[0]) { + LOG(FATAL) << "weights scale size must equal to filter size"; + } + if (w_scale_.size() == 1) { + for (int i = 0; i < param.filter->dims()[0]; ++i) { + weight_tmp[i] = (w_scale_[0]); + } + } else { + for (int i = 0; i < param.filter->dims()[0]; ++i) { + weight_tmp[i] = (w_scale_[i]); + } + } + const float input_scale = param.input_scale; + const float output_scale = param.output_scale; + int relu_type = 0; + float relu_alpha = 1.f; + + if (param.activation_param.has_active) { + if (param.activation_param.active_type == + lite_api::ActivationType::kRelu6) { + relu_type = 2; + relu_alpha = param.activation_param.Relu_clipped_coef / output_scale; + } else if (param.activation_param.active_type == + lite_api::ActivationType::kLeakyRelu) { + relu_type = 3; + relu_alpha = param.activation_param.Leaky_relu_alpha / output_scale; + } else if (param.activation_param.active_type == + lite_api::ActivationType::kRelu) { + relu_type = 1; + } + } + + auto weight_scale = weight_s.data(); + for (int g = 0; g < groups; g++) { + const int8_t* weights_group = weights + g * group_size_weights; + auto gemm = new lite::loongarch::math::generate_gemm_s8u8_loongarch_kern( + false, + false, + m, + n, + k, + weights_group, + n, + weight_scale + g * m, + input_scale, + output_scale, + bias_ptr + g * m, + relu_type, + relu_alpha); + gemm_s8_ptr_int8_.push_back(gemm); + } +} + +template <> +void Conv2dCompute::Run() { + INIT_PARAM + int group_size_coldata = n * k; + int channel_size_in = hin * win; + int channel_size_out = hout * wout; + int chin_per_group = chin / group; + int group_size_weights = m * k; + int8_t* col_data = nullptr; + auto din = param.x->data(); + auto dout = param.output->mutable_data(); + auto weights = param.filter->data(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + + if (!flag_1x1gemm_) { + int col_size = group * group_size_coldata; + col_data = static_cast( + TargetMalloc(TARGET(kLoongArch), col_size * sizeof(int8_t))); + } + for (int b = 0; b < num; ++b) { + for (int g = 0; g < group; ++g) { + int8_t* dout_group = dout + (b * chout + g * m) * channel_size_out; + const int8_t* din_group = + din + (b * chin + g * chin_per_group) * channel_size_in; + const int8_t* weights_group = weights + g * group_size_weights; + + if (!flag_1x1gemm_) { + lite::loongarch::math::im2col(din_group, + chin_per_group, + hin, + win, + kh, + kw, + paddings[0], + paddings[1], + paddings[2], + paddings[3], + 
param.strides[0], + param.strides[1], + dilations[0], + dilations[1], + col_data); + gemm_s8_ptr_int8_[g]->compute(weights_group, col_data, dout_group); + } else { + gemm_s8_ptr_int8_[g]->compute(weights_group, din_group, dout_group); + } + } + } + if (!flag_1x1gemm_) TargetFree(TARGET(kLoongArch), col_data); +} + +#undef PREPARE_PARAM +#undef PREPARE_PARAM_INT8 +#undef INIT_PARAM +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::loongarch::Conv2dCompute + ConvFp32; +typedef paddle::lite::kernels::loongarch::Conv2dCompute + ConvInt8_Fp32; +typedef paddle::lite::kernels::loongarch::Conv2dCompute + ConvInt8_Int8; + +REGISTER_LITE_KERNEL(conv2d, kLoongArch, kFloat, kNCHW, ConvFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("SecondInput", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindPaddleOpVersion("conv2d", 1) + .Finalize(); + +REGISTER_LITE_KERNEL(depthwise_conv2d, kLoongArch, kFloat, kNCHW, ConvFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindPaddleOpVersion("depthwise_conv2d", 1) + .Finalize(); + +REGISTER_LITE_KERNEL(conv2d, kLoongArch, kInt8, kNCHW, ConvInt8_Int8, int8_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindInput("SecondInput", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Filter", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindPaddleOpVersion("conv2d", 1) + .Finalize(); + +REGISTER_LITE_KERNEL(conv2d, kLoongArch, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindInput("SecondInput", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Filter", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindPaddleOpVersion("conv2d", 1) + .Finalize(); + +REGISTER_LITE_KERNEL( + depthwise_conv2d, kLoongArch, kInt8, kNCHW, ConvInt8_Int8, int8_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Filter", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindPaddleOpVersion("depthwise_conv2d", 1) + .Finalize(); + +REGISTER_LITE_KERNEL( + depthwise_conv2d, kLoongArch, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Filter", + 
{LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindPaddleOpVersion("depthwise_conv2d", 1) + .Finalize(); diff --git a/lite/kernels/loongarch/conv_compute.h b/lite/kernels/loongarch/conv_compute.h new file mode 100644 index 00000000000..81821229bfc --- /dev/null +++ b/lite/kernels/loongarch/conv_compute.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include "lite/backends/loongarch/math/common/conv_utils.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/conv_bias.h" +#include "lite/backends/loongarch/math/gemm_s8u8_compute.h" +#include "lite/backends/loongarch/math/im2col.h" +#include "lite/backends/loongarch/math/vol2col.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/operators/conv_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +inline bool IsExpand(const std::vector& filter_dim, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +template +class Conv2dCompute : public KernelLite { + public: + virtual void PrepareForRun(); + + virtual void ReInitWhenNeeded() { + if (impl_) { + impl_->ReInitWhenNeeded(); + } + } + + virtual void Run(); + +#ifdef LITE_WITH_PROFILE + std::string kernel_func_name_{"Conv2d"}; + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = "NotImplForConv"; + } +#endif + + ~Conv2dCompute() { + if (impl_ != nullptr) { + delete impl_; + } + for (int i = 0; i < gemm_s8_ptr_float_.size(); i++) + delete gemm_s8_ptr_float_[i]; + for (int i = 0; i < gemm_s8_ptr_int8_.size(); i++) + delete gemm_s8_ptr_int8_[i]; + } + + private: + using param_t = operators::ConvParam; + KernelLite* impl_{nullptr}; + Context* device_ctx; + bool flag_1x1gemm_{false}; + bool flag_trans_bias_{true}; + std::vector w_scale_; + Tensor weights_; + Tensor bias_; + std::vector*> + gemm_s8_ptr_float_{}; + std::vector*> + gemm_s8_ptr_int8_{}; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/conv_compute_test.cc b/lite/kernels/loongarch/conv_compute_test.cc new file mode 100644 index 00000000000..e470785219a --- /dev/null +++ b/lite/kernels/loongarch/conv_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/conv_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(conv_loongarch, retrive_op) { + auto conv2d = KernelRegistry::Global().Create("conv2d"); + ASSERT_FALSE(conv2d.empty()); + ASSERT_TRUE(conv2d.front()); +} + +TEST(conv2d_loongarch, init) { + Conv2dCompute conv2d; + ASSERT_EQ(conv2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(conv2d.target(), TARGET(kLoongArch)); +} + +TEST(conv2d_loongarch, run_test) { + lite::Tensor x, filter, b, out; + const int batch_size = 1; + std::vector x_shape{batch_size, 3, 3, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector filter_shape{1, 3, 3, 3}; + filter.Resize(lite::DDim(filter_shape)); + std::vector b_shape{1, 3, 1, 1}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{batch_size, 1, 1, 1}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto filter_data = filter.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = 1; + } + for (int64_t i = 0; i < filter.dims().production(); i++) { + filter_data[i] = 1; + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = 0; + } + + Conv2dCompute conv2d; + operators::ConvParam param; + + param.x = &x; + param.filter = &filter; + param.bias = &b; + param.output = &out; + param.strides = {1, 1}; + std::vector paddings = {0, 0, 0, 0}; + param.groups = 1; + std::vector dilations = {1, 1}; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); + LOG(INFO) << 123; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + conv2d.SetContext(std::move(ctx)); + conv2d.SetParam(param); + conv2d.Run(); + + LOG(INFO) << "output: "; + float ref_result[1] = {27.}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_result[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(conv2d, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/conv_depthwise.cc b/lite/kernels/loongarch/conv_depthwise.cc new file mode 100644 index 00000000000..0098c407e51 --- /dev/null +++ b/lite/kernels/loongarch/conv_depthwise.cc @@ -0,0 +1,285 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/conv_depthwise.h" +#include "lite/backends/loongarch/math/lsx/conv_depthwise_pack4.h" +#include "lite/backends/loongarch/math/lasx/conv_depthwise_pack8.h" +#include "lite/backends/loongarch/math/common/conv_utils.h" +#include "lite/backends/loongarch/math/conv_depthwise_impl.h" +#include "lite/backends/loongarch/math/conv_depthwise_int8.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { +#define CONV_DW_PARAM \ + i_data, o_data, bs, oc, oh, ow, ic, ih, iw, w_data, b_data, pad, flag_bias, \ + act_param + +template <> +void DepthwiseConv::PrepareForRun() {} + +template <> +void DepthwiseConv::Run() { + auto& param = this->Param(); + CHECK(this->ctx_); + + auto input_dims = param.x->dims(); + CHECK_EQ(input_dims.size(), 4UL); + + const auto* i_data = param.x->data(); + const auto* w_data = param.filter->data(); + const auto* b_data = param.bias ? param.bias->data() : nullptr; + auto act_param = param.activation_param; + const auto stride = param.strides[1]; + auto pad = (*param.paddings)[2]; + bool flag_bias = param.bias != nullptr; + auto* o_data = param.output->mutable_data(); + auto dilations = *param.dilations; + bool pad_less = pad < 2; + + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + int iw = x_dims[3]; + int ih = x_dims[2]; + int ic = x_dims[1]; + int bs = x_dims[0]; + int oh = o_dims[2]; + int ow = o_dims[3]; + int oc = o_dims[1]; + int kh = w_dims[2]; + + if (kh == 3) { + if ((dilations[0] == 1) && (dilations[1] == 1) && pad_less) { + if (stride == 1) { + lite::loongarch::math::conv_depthwise_3x3s1_p01_direct(CONV_DW_PARAM); + } else if (stride == 2) { + lite::loongarch::math::conv_depthwise_3x3s2_p01_direct(CONV_DW_PARAM); + } + } else { + lite::loongarch::math::conv_depthwise_3x3_pack( + param, &input_padding_, &input_pack_, &filter_pack_, &output_pack_); + } + } else if (kh == 5) { + if (stride == 1) { + lite::loongarch::math::conv_depthwise_5x5s1(CONV_DW_PARAM); + } else if (stride == 2) { + lite::loongarch::math::conv_depthwise_5x5s2(CONV_DW_PARAM); + } + } else { + LOG(FATAL) << "kw and kh only support 3 or 5"; + } + KERNEL_FUNC_NAME("conv_depthwise_direct") +} + +PROFILE_INFO(kFloat, kFloat) + +template <> +void DepthwiseConv::PrepareForRun() { + auto& param = this->Param(); + + //! update scale + w_scale_ = param.weight_scale; + if (w_scale_.size() != 1 && w_scale_.size() != param.filter->dims()[0]) { + LOG(FATAL) << "weights scale size must equal to filter size"; + return; + } + if (w_scale_.size() == 1) { + for (int i = 0; i < param.filter->dims()[0] - 1; ++i) { + w_scale_.push_back(w_scale_[0]); + } + } + float input_scale = param.input_scale; + for (auto& ws : w_scale_) { + ws *= input_scale; + } +} + +#define CONV_DW_INT8_PARAM \ + o_data, i_data, w_data, b_data, bs, ic, ih, iw, oh, ow, pad_h, pad_w, \ + flag_act, alpha, w_scale_.data(), &ctx +template <> +void DepthwiseConv::Run() { + auto& param = this->Param(); + CHECK(this->ctx_); + auto& ctx = this->ctx_->template As(); + const auto* i_data = param.x->data(); + const auto* w_data = param.filter->data(); + const auto* b_data = param.bias ? 
param.bias->data() : nullptr; + if (flag_trans_bias_) { + b_data = bias_.data(); + } + auto* o_data = param.output->mutable_data(); + + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + int iw = x_dims[3]; + int ih = x_dims[2]; + int ic = x_dims[1]; + int bs = x_dims[0]; + int oh = o_dims[2]; + int ow = o_dims[3]; + auto padding = *param.paddings; + int pad_h = padding[0]; + int pad_w = padding[2]; + + auto act_param = param.activation_param; + auto act_type = act_param.active_type; + float alpha = 0.f; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + alpha = act_param.Relu_clipped_coef; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + alpha = act_param.Leaky_relu_alpha; + } + } + + if (w_dims[2] == 3 && param.strides[0] == 1) { + lite::loongarch::math::conv_3x3s1_dw_int8(CONV_DW_INT8_PARAM); + } else if (w_dims[2] == 3 && param.strides[0] == 2) { + if (padding[0] == 0) { + lite::loongarch::math::conv_3x3s2p0_dw_int8(CONV_DW_INT8_PARAM); + } else if (padding[0] == 1) { + lite::loongarch::math::conv_3x3s2p1_dw_int8(CONV_DW_INT8_PARAM); + } else { + LOG(FATAL) << "LoongArch doesn't support paddings >= 2, now padding: " + << padding[0]; + } + } else { + LOG(FATAL) << "LoongArch doesn't support other depthwise, now kernel: " + << w_dims[2] << ", and " + << "strides: " << param.strides[0]; + } +} + +PROFILE_INFO(kInt8, kFloat) + +template <> +void DepthwiseConv::PrepareForRun() { + auto& param = this->Param(); + + //! update scale + w_scale_ = param.weight_scale; + if (w_scale_.size() != 1 && w_scale_.size() != param.filter->dims()[0]) { + LOG(FATAL) << "weights scale size must equal to filter size"; + return; + } + if (w_scale_.size() == 1) { + for (int i = 0; i < param.filter->dims()[0] - 1; ++i) { + w_scale_.push_back(w_scale_[0]); + } + } + float input_scale = param.input_scale; + for (auto& ws : w_scale_) { + ws *= input_scale; + } + //! update bias + if (param.bias) { + bias_.Resize(param.bias->dims()); + auto ptr = bias_.mutable_data(); + auto ptr_in = param.bias->data(); + for (int i = 0; i < bias_.numel(); ++i) { + ptr[i] = ptr_in[i] / param.output_scale; + } + flag_trans_bias_ = true; + } + //! update relu6 parameter + if (param.activation_param.active_type == lite_api::ActivationType::kRelu6) { + param.activation_param.Relu_clipped_coef = + param.activation_param.Relu_clipped_coef / param.output_scale; + } + //! update leakyRelu parameter + if (param.activation_param.active_type == + lite_api::ActivationType::kLeakyRelu) { + param.activation_param.Leaky_relu_alpha = + param.activation_param.Leaky_relu_alpha / param.output_scale; + } +} + +template <> +void DepthwiseConv::Run() { + auto& param = this->Param(); + CHECK(this->ctx_); + auto& ctx = this->ctx_->template As(); + const auto* i_data = param.x->data(); + const auto* w_data = param.filter->data(); + const auto* b_data = param.bias ? 
param.bias->data() : nullptr; + if (flag_trans_bias_) { + b_data = bias_.data(); + } + auto* o_data = param.output->mutable_data(); + + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + int iw = x_dims[3]; + int ih = x_dims[2]; + int ic = x_dims[1]; + int bs = x_dims[0]; + int oh = o_dims[2]; + int ow = o_dims[3]; + auto padding = *param.paddings; + int pad_h = padding[0]; + int pad_w = padding[2]; + + auto act_param = param.activation_param; + auto act_type = act_param.active_type; + float alpha = 0.f; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + alpha = act_param.Relu_clipped_coef; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + alpha = act_param.Leaky_relu_alpha; + } + } + + if (w_dims[2] == 3 && param.strides[0] == 1) { + lite::loongarch::math::conv_3x3s1_dw_int8(CONV_DW_INT8_PARAM); + } else if (w_dims[2] == 3 && param.strides[0] == 2) { + if (padding[0] == 0) { + lite::loongarch::math::conv_3x3s2p0_dw_int8(CONV_DW_INT8_PARAM); + } else if (padding[0] == 1) { + lite::loongarch::math::conv_3x3s2p1_dw_int8(CONV_DW_INT8_PARAM); + } else { + LOG(FATAL) << "LoongArch doesn't support paddings >= 2, now padding: " + << padding[0]; + } + } else { + LOG(FATAL) << "LoongArch doesn't support other depthwise, now kernel: " + << w_dims[2] << ", and " + << "strides: " << param.strides[0]; + } +} + +PROFILE_INFO(kInt8, kInt8) +#undef CONV_DW_INT8_PARAM +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/conv_depthwise.h b/lite/kernels/loongarch/conv_depthwise.h new file mode 100644 index 00000000000..fd4f87b9d5f --- /dev/null +++ b/lite/kernels/loongarch/conv_depthwise.h @@ -0,0 +1,70 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
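The DepthwiseConv kernels above dispatch 3x3 filters to conv_depthwise_3x3s1_p01_direct / conv_depthwise_3x3s2_p01_direct and 5x5 filters to the 5x5 routines; those direct implementations live in lite/backends/loongarch/math and are not part of this hunk. As a reference for what they compute, here is a naive scalar depthwise 3x3, stride-1, pad-1 loop over one NCHW batch (illustrative only, not the optimized kernel).

#include <vector>

// Naive depthwise 3x3, stride 1, zero pad 1: each channel is convolved with
// its own 3x3 filter, so the output keeps the input's C/H/W.
void DepthwiseConv3x3(const float* in, const float* w, const float* bias,
                      float* out, int C, int H, int W) {
  for (int c = 0; c < C; ++c) {
    const float* ic = in + c * H * W;
    const float* wc = w + c * 9;
    float* oc = out + c * H * W;
    for (int y = 0; y < H; ++y) {
      for (int x = 0; x < W; ++x) {
        float acc = bias ? bias[c] : 0.f;
        for (int r = -1; r <= 1; ++r) {
          for (int s = -1; s <= 1; ++s) {
            int yy = y + r, xx = x + s;
            if (yy < 0 || yy >= H || xx < 0 || xx >= W) continue;  // zero pad
            acc += ic[yy * W + xx] * wc[(r + 1) * 3 + (s + 1)];
          }
        }
        oc[y * W + x] = acc;
      }
    }
  }
}

int main() {
  const int C = 1, H = 3, W = 3;
  std::vector<float> in(C * H * W, 1.f), w(C * 9, 1.f), out(C * H * W);
  DepthwiseConv3x3(in.data(), w.data(), nullptr, out.data(), C, H, W);
  return out[4] == 9.f ? 0 : 1;  // the center pixel sees all nine ones
}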
+#pragma once + +#include +#include +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class DepthwiseConv : public KernelLite { + public: + DepthwiseConv() = default; + ~DepthwiseConv() {} + void PrepareForRun() override; + virtual void Run(); + +#ifdef LITE_WITH_PROFILE + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + } + + std::string kernel_func_name_{"NotImplForConvDepthwise"}; +#define PROFILE_INFO(dtype1, dtype2) \ + template <> \ + void DepthwiseConv:: \ + SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { \ + ch->kernel_func_name = kernel_func_name_; \ + } + +#define KERNEL_FUNC_NAME(kernel_func_name) kernel_func_name_ = kernel_func_name; + +#else +#define PROFILE_INFO(dtype1, dtype2) +#define KERNEL_FUNC_NAME(kernel_func_name) +#endif + + private: + using param_t = operators::ConvParam; + Tensor input_pack_; + Tensor input_padding_; + Tensor filter_pack_; + Tensor output_pack_; + bool flag_trans_bias_{true}; + std::vector w_scale_; + Tensor bias_; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/conv_transpose_compute.cc b/lite/kernels/loongarch/conv_transpose_compute.cc new file mode 100644 index 00000000000..3c1dbdb8ad5 --- /dev/null +++ b/lite/kernels/loongarch/conv_transpose_compute.cc @@ -0,0 +1,195 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
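Unlike the direct depthwise path above, the general Conv2dCompute kernel earlier in this patch lowers each group to im2col followed by a Blas GEMM (or a GEMV when the output spatial size is 1). A toy single-channel, no-padding, stride-1 sketch of that lowering with naive loops in place of the Blas calls:

#include <vector>

// im2col for one input channel, stride 1, no padding, no dilation:
// produces a (kh*kw) x (oh*ow) matrix in row-major order.
std::vector<float> Im2Col(const std::vector<float>& in, int ih, int iw,
                          int kh, int kw) {
  const int oh = ih - kh + 1, ow = iw - kw + 1;
  std::vector<float> col(kh * kw * oh * ow);
  for (int r = 0; r < kh; ++r)
    for (int c = 0; c < kw; ++c)
      for (int y = 0; y < oh; ++y)
        for (int x = 0; x < ow; ++x)
          col[((r * kw + c) * oh + y) * ow + x] = in[(y + r) * iw + (x + c)];
  return col;
}

// Naive GEMM: out[m x n] = A[m x k] * B[k x n]; stands in for Blas::GEMM.
std::vector<float> Gemm(const std::vector<float>& a,
                        const std::vector<float>& b, int m, int n, int k) {
  std::vector<float> out(m * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int p = 0; p < k; ++p)
      for (int j = 0; j < n; ++j) out[i * n + j] += a[i * k + p] * b[p * n + j];
  return out;
}

int main() {
  // 1x1x3x3 input, one 3x3 filter of ones -> a single output equal to the sum.
  std::vector<float> in(9, 1.f), filter(9, 1.f);
  auto col = Im2Col(in, 3, 3, 3, 3);      // 9 x 1 column buffer
  auto out = Gemm(filter, col, 1, 1, 9);  // out[0] == 9
  return out[0] == 9.f ? 0 : 1;
}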
+ +#include "lite/kernels/loongarch/conv_transpose_compute.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/conv2d_transpose.h" +#include "lite/backends/loongarch/math/fill_bias_activate.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { +#define INIT_PARAM \ + auto& param = this->Param(); \ + auto x_dims = param.x->dims(); \ + auto w_dims = param.filter->dims(); \ + auto o_dims = param.output->dims(); \ + int win = x_dims[3]; \ + int hin = x_dims[2]; \ + int chin = x_dims[1]; \ + int num = x_dims[0]; \ + int wout = o_dims[3]; \ + int hout = o_dims[2]; \ + int chout = o_dims[1]; \ + int kw = w_dims[3]; \ + int kh = w_dims[2]; \ + int group = param.groups; \ + /* deconv weights layout: chin * chout * kh * kw*/ \ + int m = chout * kw * kh / group; \ + int n = hin * win; \ + int k = chin / group; + +#define DEPTHWISE_PARAM \ + auto dilations = *param.dilations; \ + bool ks_equal = (param.strides[0] == param.strides[1]) && (kw == kh); \ + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); \ + depthwise_ = \ + (param.groups == chin && chin == chout && ks_equal && no_dilation); \ + bool depth_wise_s1 = \ + depthwise_ && (param.strides[0] == 1 && param.strides[1] == 1); \ + bool depth_wise_s2 = \ + depthwise_ && (param.strides[0] == 2 && param.strides[1] == 2); + +#define DEPTHWISE_FUNCS \ + din_batch, weights, chout, hout, wout, kh, kw, paddings[0], paddings[1], \ + paddings[2], paddings[3], dilations[0], dilations[1], dout_batch, &ctx + +template <> +void Conv2DTransposeCompute::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + int win = x_dims[3]; + int hin = x_dims[2]; + int chin = x_dims[1]; + int chout = o_dims[1]; + int kw = w_dims[3]; + int kh = w_dims[2]; + int m = chout * kw * kh / param.groups; + int n = hin * win; + + workspace_size_ = param.groups * m * n * sizeof(float); + auto dilations = *param.dilations; + bool ks_equal = (param.strides[0] == param.strides[1]) && (kw == kh); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + depthwise_ = + (param.groups == chin && chin == chout && ks_equal && no_dilation); + is_first_epoch_ = false; +} + +PROFILE_INFO(kFloat, kFloat) +template <> +void Conv2DTransposeCompute::Run() { + auto& ctx = this->ctx_->template As(); + INIT_PARAM + bool flag_bias = (param.bias != nullptr); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + + int group_size_in = win * hin * chin / group; + int group_size_weights = chin / group * chout / group * kw * kh; + int group_size_coldata = m * n; + bool pads_all_qual = pads_equal && (paddings[0] == paddings[2]); + bool flag_1x1s1p1 = (kw == 1) && (kh == 1) && (param.strides[0] == 1) && + (param.strides[1] == 1) && pads_all_qual && + (paddings[0] == 0) && (dilations[0] == 1) && + (dilations[1] == 1); + auto din = param.x->data(); + auto dout = param.output->mutable_data(); + auto weights = param.filter->data(); + auto act_param = param.activation_param; + bool depthwise_s1 = + depthwise_ && (param.strides[0] == 1 && param.strides[1] == 1); + bool depthwise_s2 = + depthwise_ && (param.strides[0] == 2 && param.strides[1] == 2); + const float* bias_ptr = + flag_bias ? 
static_cast(param.bias->data()) + : nullptr; + float* col_data = nullptr; + + if (!flag_1x1s1p1) { + int col_size = param.groups * group_size_coldata; + col_data = static_cast( + TargetMalloc(TARGET(kLoongArch), col_size * sizeof(float))); + } + + for (int i = 0; i < num; i++) { + const float* din_batch = din + i * chin * hin * win; + float* dout_batch = dout + i * chout * hout * wout; + + if (depthwise_s1) { + lite::loongarch::math::conv_transpose_depthwise_s1(DEPTHWISE_FUNCS); + } else if (depthwise_s2) { + lite::loongarch::math::conv_transpose_depthwise_s2(DEPTHWISE_FUNCS); + } else { + paddle::lite::loongarch::math::Blas matmul(ctx); + if (flag_1x1s1p1) { + col_data = dout_batch; + } + for (int g = 0; g < group; g++) { + const float* din_group = din_batch + g * group_size_in; + const float* weights_group = weights + g * group_size_weights; + float* coldata_group = col_data + g * group_size_coldata; + matmul.GEMM(true, + false, + m, + n, + k, + 1.f, + weights_group, + m, + din_group, + n, + 0.f, + coldata_group, + n); + } + if (!flag_1x1s1p1) { + lite::loongarch::math::col2im(col_data, + chout, + hout, + wout, + kh, + kw, + paddings[0], + paddings[1], + paddings[2], + paddings[3], + param.strides[0], + param.strides[1], + dilations[0], + dilations[1], + dout_batch); + } + } + // bias and activate + lite::loongarch::math::fill_bias_act( + dout_batch, bias_ptr, chout, wout * hout, flag_bias, &act_param); + } + if (!flag_1x1s1p1) TargetFree(TARGET(kLoongArch), col_data); +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::loongarch::Conv2DTransposeCompute + ConvTransFp32; + +REGISTER_LITE_KERNEL(conv2d_transpose, kLoongArch, kFloat, kNCHW, ConvTransFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindPaddleOpVersion("conv2d_transpose", 1) + .Finalize(); diff --git a/lite/kernels/loongarch/conv_transpose_compute.h b/lite/kernels/loongarch/conv_transpose_compute.h new file mode 100644 index 00000000000..0a1d9554b84 --- /dev/null +++ b/lite/kernels/loongarch/conv_transpose_compute.h @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
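Conv2DTransposeCompute above either calls the direct depthwise transpose routines or runs GEMM with the weights transposed (deconv weights are laid out chin-major) and then scatters the column buffer back with col2im. The sketch below shows what that GEMM + col2im pair amounts to for one channel with stride s and no padding; ConvTranspose is an illustrative helper, not the kernel's API.

#include <vector>

// Naive transposed convolution: every input pixel scatter-adds a
// filter-weighted patch into an ((ih-1)*s + kh) x ((iw-1)*s + kw) output.
std::vector<float> ConvTranspose(const std::vector<float>& in, int ih, int iw,
                                 const std::vector<float>& w, int kh, int kw,
                                 int s) {
  const int oh = (ih - 1) * s + kh, ow = (iw - 1) * s + kw;
  std::vector<float> out(oh * ow, 0.f);
  for (int y = 0; y < ih; ++y)
    for (int x = 0; x < iw; ++x)
      for (int r = 0; r < kh; ++r)
        for (int c = 0; c < kw; ++c)
          out[(y * s + r) * ow + (x * s + c)] += in[y * iw + x] * w[r * kw + c];
  return out;
}

int main() {
  // 2x2 input of ones, 2x2 filter of ones, stride 2 -> 4x4 output of ones
  // (the scattered patches tile the output without overlap).
  std::vector<float> in(4, 1.f), w(4, 1.f);
  auto out = ConvTranspose(in, 2, 2, w, 2, 2, 2);
  return (out.size() == 16 && out[0] == 1.f) ? 0 : 1;
}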
+ +#pragma once + +#include "lite/core/kernel.h" +#include "lite/operators/conv_transpose_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { +template +class Conv2DTransposeCompute : public KernelLite { + public: + using param_t = operators::ConvParam; + + void PrepareForRun() override; + + void Run() override; + + ~Conv2DTransposeCompute() = default; + +#ifdef LITE_WITH_PROFILE + std::string kernel_func_name_{"ConvTranspose"}; + + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + } + +#define PROFILE_INFO(dtype1, dtype2) \ + template <> \ + void Conv2DTransposeCompute:: \ + SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { \ + ch->kernel_func_name = kernel_func_name_; \ + } + +#define KERNEL_FUNC_NAME(kernel_func_name) kernel_func_name_ = kernel_func_name; + +#else +#define PROFILE_INFO(dtype1, dtype2) +#define KERNEL_FUNC_NAME(kernel_func_name) +#endif + + protected: + int workspace_size_{0}; + bool depthwise_{false}; + bool flag_trans_bias_{false}; + std::vector w_scale_; + Tensor bias_; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/density_prior_box_compute.cc b/lite/kernels/loongarch/density_prior_box_compute.cc new file mode 100644 index 00000000000..8933897ff0f --- /dev/null +++ b/lite/kernels/loongarch/density_prior_box_compute.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/loongarch/density_prior_box_compute.h" +#include +#include +#include "lite/backends/loongarch/math/prior_box.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void DensityPriorBoxCompute::Run() { + auto& param = *param_.get_mutable(); + // required inputs + auto* input = param.input; // 4D tensor NCHW + auto* image = param.image; // 4D tensor NCHW + // outputs + auto* boxes = param.boxes; // [H, W, num_priors, 4] + auto* vars = param.variances; // [H, W, num_priors, 4] + // required attributes + bool clip = param.clip; + std::vector variances = param.variances_; + std::vector fixed_sizes = param.fixed_sizes; + std::vector fixed_ratios = param.fixed_ratios; + std::vector densities = param.density_sizes; + // optional attributes + float step_w = param.step_w; + float step_h = param.step_h; + float offset = param.offset; + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + float step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + int num_priors = 0; + +#pragma omp parallel for reduction(+ : num_priors) + for (int i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + + boxes->Resize({feature_height, feature_width, num_priors, 4}); + vars->Resize({feature_height, feature_width, num_priors, 4}); + + auto* boxes_data = boxes->mutable_data(); + auto* vars_data = vars->mutable_data(); + + const float* input_data = input->data(); + const float* image_data = image->data(); + + lite::loongarch::math::density_prior_box(img_width, + img_height, + feature_width, + feature_height, + input_data, + image_data, + clip, + variances, + fixed_sizes, + fixed_ratios, + densities, + step_width, + step_height, + offset, + num_priors, + boxes_data, + vars_data); + if (param.flatten_to_2d) { + auto out_dims = boxes->dims(); + int64_t sum = 1; + for (int i = 0; i < out_dims.size() - 1; i++) { + sum *= out_dims[i]; + } + boxes->Resize({sum, 4}); + vars->Resize({sum, 4}); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(density_prior_box, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::DensityPriorBoxCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Image", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/density_prior_box_compute.h b/lite/kernels/loongarch/density_prior_box_compute.h new file mode 100644 index 00000000000..2fd9f8925d8 --- /dev/null +++ b/lite/kernels/loongarch/density_prior_box_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +class DensityPriorBoxCompute + : public KernelLite { + public: + using param_t = operators::DensityPriorBoxParam; + + void Run() override; + + virtual ~DensityPriorBoxCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/dropout_compute.cc b/lite/kernels/loongarch/dropout_compute.cc new file mode 100644 index 00000000000..8a5f9b638e4 --- /dev/null +++ b/lite/kernels/loongarch/dropout_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/dropout_compute.h" + +REGISTER_LITE_KERNEL(dropout, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::DropoutCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Seed", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/dropout_compute.h b/lite/kernels/loongarch/dropout_compute.h new file mode 100644 index 00000000000..031c29e2508 --- /dev/null +++ b/lite/kernels/loongarch/dropout_compute.h @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
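For reference, the density_prior_box kernel above sizes its outputs from the densities and fixed ratios alone. A minimal sketch of that accounting follows; it is illustrative only, and the attribute values are made up, but the formula is the one used in the kernel (each density d contributes fixed_ratios.size() * d * d priors per feature-map cell).

// Illustrative only: prior count for densities = {4, 2} and one fixed ratio.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int> densities = {4, 2};       // hypothetical attribute values
  const std::vector<float> fixed_ratios = {1.0f};  // hypothetical attribute values
  int num_priors = 0;
  for (size_t i = 0; i < densities.size(); ++i) {
    num_priors += static_cast<int>(fixed_ratios.size() * std::pow(densities[i], 2));
  }
  std::printf("num_priors = %d\n", num_priors);    // prints 20; Boxes is {H, W, 20, 4}
  return 0;
}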
+#pragma once + +#include +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/operators/dropout_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +using EigenMatrix = lite::fluid::EigenMatrix; + +template +class DropoutCompute : public KernelLite { + public: + using param_t = operators::DropoutParam; + void Run() override { + auto& param = *param_.get_mutable(); + const auto* x_data = param.x->template data(); + auto* out_data = param.output->template mutable_data(); + if (!param.is_test) { + auto* mask_data = param.mask->template mutable_data(); + size_t size = param.mask->dims().production(); + // Special case when dropout_prob is 1.0 + if (param.dropout_prob == 1.0f) { + std::memset(out_data, 0, size * sizeof(T)); // NOLINT + std::memset(mask_data, 0, size * sizeof(T)); // NOLINT + return; + } + // std::minstd_rand engine; + // NOTE: fixed seed should only be used in unittest or for debug. + // Guarantee to use random seed in training. + int seed_data = 0; + if (param.seed_tensor->template data()) { + seed_data = *(param.seed_tensor->template data()); + } else { + seed_data = param.fix_seed ? param.seed : 0; + } + std::minstd_rand engine; + engine.seed(seed_data); + std::uniform_real_distribution dist(0, 1); + + for (size_t i = 0; i < size; ++i) { + if (dist(engine) < param.dropout_prob) { + mask_data[i] = 0; + out_data[i] = 0; + } else { + mask_data[i] = 1; + if (param.dropout_implementation == "upscale_in_train") { + out_data[i] = x_data[i] / static_cast(1.0f - param.dropout_prob); + } else { + out_data[i] = x_data[i]; + } + } + } + } else { + auto X = EigenMatrix::Reshape(*param.x, 1); + auto Y = EigenMatrix::Reshape(*param.output, 1); + if (param.dropout_implementation == "upscale_in_train") { + Y.device(lite::fluid::EigenDeviceType()) = X; + } else { + Y.device(lite::fluid::EigenDeviceType()) = + X * static_cast(1.0f - param.dropout_prob); + } + } + } + + virtual ~DropoutCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/dropout_compute_test.cc b/lite/kernels/loongarch/dropout_compute_test.cc new file mode 100644 index 00000000000..b01ebd00d5c --- /dev/null +++ b/lite/kernels/loongarch/dropout_compute_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
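The dropout header above implements both scaling conventions. The tiny standalone sketch below (illustrative only, numbers made up) makes the difference concrete: "upscale_in_train" rescales kept activations during training and leaves inference untouched, while the default convention does the opposite.

// Illustrative only: dropout scaling conventions for p = 0.25, x = 2.0.
#include <cstdio>

int main() {
  const float p = 0.25f, x = 2.0f;
  // "upscale_in_train": kept activations are rescaled while training,
  // inference passes values through unchanged.
  std::printf("upscale_in_train: train %.4f, infer %.4f\n", x / (1.0f - p), x);
  // default convention: training keeps values as-is, inference scales by (1 - p).
  std::printf("default:          train %.4f, infer %.4f\n", x, x * (1.0f - p));
  return 0;
}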
+ +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/dropout_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(dropout_loongarch, retrive_op) { + auto dropout = KernelRegistry::Global().Create("dropout"); + ASSERT_FALSE(dropout.empty()); + ASSERT_TRUE(dropout.front()); +} + +TEST(dropout_loongarch, init) { + DropoutCompute dropout; + ASSERT_EQ(dropout.precision(), PRECISION(kFloat)); + ASSERT_EQ(dropout.target(), TARGET(kLoongArch)); +} + +TEST(dropout_loongarch, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + // DropoutCompute dropout; + DropoutCompute dropout; + operators::DropoutParam param; + + param.x = &x; + param.dropout_prob = 0.25; + param.is_test = true; + param.fix_seed = true; + param.output = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + dropout.SetContext(std::move(ctx)); + dropout.SetParam(param); + dropout.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(dropout, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/elementwise_compute.cc b/lite/kernels/loongarch/elementwise_compute.cc new file mode 100644 index 00000000000..724d2693e0b --- /dev/null +++ b/lite/kernels/loongarch/elementwise_compute.cc @@ -0,0 +1,738 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Attention! There is no guarantee that dividing(or floordividing) +// by 0 will get the correct result in ElementWise OP. 
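The note above is worth taking literally: for integral element types, a / 0 is undefined behavior in C++, so any guard has to live in the caller. A minimal sketch of such a guard follows; the checked_div helper is hypothetical and not part of this patch.

// Hypothetical caller-side guard; the elementwise kernels below intentionally do not check.
#include <optional>

template <typename T>
std::optional<T> checked_div(T a, T b) {
  if (b == T(0)) return std::nullopt;  // avoid UB for integer division by zero
  return a / b;
}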
+ +#include "lite/kernels/loongarch/elementwise_compute.h" +#include +#include +#include "lite/backends/loongarch/math/elementwise.h" +#include "lite/backends/loongarch/math/elementwise_common_broadcast_config.h" +#include "lite/kernels/host/elementwise_op_func.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +namespace loongarch_math = paddle::lite::loongarch::math; + +// Remove trailing dimensions of size 1 for y +static DDim trim_trailing_singular_dims(const DDim& dims) { + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + +/* + * Out = X point dot Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) + * 3. force x_dims.size() is greater than y_dims.size(), else + * return false. + */ +bool is_fast_broadcast(const DDim& x_dims, + const DDim& y_dims, + int axis, + int* pre, + int* n, + int* post) { + if (axis == -1) { + axis = x_dims.size() - y_dims.size(); + } + if (axis < 0) { + VLOG(4) << "Fast broadcast chk fail, for x_dims smaller."; + return false; + } + DDim y_dim_trim = trim_trailing_singular_dims(y_dims); + axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis; + if (x_dims.size() < (y_dim_trim.size() + axis)) { + VLOG(4) << "Fast broadcast chk fail, for y's shape size doesnt follow the " + "axis rule"; + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dim_trim.size(); ++i) { + if (x_dims[i + axis] != y_dim_trim[i]) { + VLOG(4) << "Fast broadcast chk fail, for dimension mismatch."; + return false; + } + (*n) *= y_dim_trim[i]; + } + for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +// function pointer +template +using FastBCastFn = void(const T* dinx, + const T* diny, + T* dout, + int batch, + int channels, + int num, + bool has_active, + std::string act_mode, + bool inv); + +template +using ElementWiseFn = void(const T* dinx, + const T* diny, + T* dout, + int num, + bool has_active, + std::string act_mode); + +template +using BinaryOpFn = lite::kernels::host::BinaryOpFn; + +template +struct LoongArchCommonElementWise { + static void Run( + // todo: if necessary, generate + // lite::kernels::host::StaticBatchElementWiseArg by + // batch_arg->ToStaticArg() before kernel launch, it will help to reduce + // runtime overhead. 
+ const lite::kernels::host::BatchElementWiseArg& + batch_arg, + BinaryOpFn op) { + int batch_num = batch_arg.BatchNum(); + auto bcast_type = batch_arg.BcastType(); + int range_length = batch_arg.ElemNumPerBatch(); + switch (bcast_type) { + case (lite::kernels::host::BroadcastType::X_AS_CONTINUOUS): { + for (int batch_id = 0; batch_id < batch_num; ++batch_id) { + paddle::lite::loongarch::math::elementwise_range_to_one( + batch_arg.XAtBatch(batch_id), + batch_arg.YAtBatch(batch_id), + batch_arg.ZAtBatch(batch_id), + range_length); + } + break; + } + case (lite::kernels::host::BroadcastType::Y_AS_CONTINUOUS): { + for (int batch_id = 0; batch_id < batch_num; ++batch_id) { + paddle::lite::loongarch::math::elementwise_one_to_range( + batch_arg.XAtBatch(batch_id), + batch_arg.YAtBatch(batch_id), + batch_arg.ZAtBatch(batch_id), + range_length); + } + break; + } + case (lite::kernels::host::BroadcastType::BOTH_CONTINUOUS): { + for (int batch_id = 0; batch_id < batch_num; ++batch_id) { + paddle::lite::loongarch::math::elementwise_range_to_range( + batch_arg.XAtBatch(batch_id), + batch_arg.YAtBatch(batch_id), + batch_arg.ZAtBatch(batch_id), + range_length); + } + break; + } + default: { + LOG(FATAL) << "Un supported bcast type(isa)"; + break; + } + } + } +}; + +template +struct LoongArchCommonElementWise { + static void Run( + // todo: if necessary, generate + // lite::kernels::host::StaticBatchElementWiseArg by + // batch_arg->ToStaticArg() before kernel launch, it will help to reduce + // runtime overhead. + const lite::kernels::host::BatchElementWiseArg& + batch_arg, + BinaryOpFn op) { + int batch_num = batch_arg.BatchNum(); + auto bcast_type = batch_arg.BcastType(); + int range_length = batch_arg.ElemNumPerBatch(); + switch (bcast_type) { + case (lite::kernels::host::BroadcastType::X_AS_CONTINUOUS): { + for (int batch_id = 0; batch_id < batch_num; ++batch_id) { + lite::kernels::host::element_wise_range_to_one( + batch_arg.XAtBatch(batch_id), + batch_arg.YAtBatch(batch_id), + batch_arg.ZAtBatch(batch_id), + range_length, + op); + } + break; + } + case (lite::kernels::host::BroadcastType::Y_AS_CONTINUOUS): { + for (int batch_id = 0; batch_id < batch_num; ++batch_id) { + lite::kernels::host::element_wise_one_to_range( + batch_arg.XAtBatch(batch_id), + batch_arg.YAtBatch(batch_id), + batch_arg.ZAtBatch(batch_id), + range_length, + op); + } + break; + } + case (lite::kernels::host::BroadcastType::BOTH_CONTINUOUS): { + for (int batch_id = 0; batch_id < batch_num; ++batch_id) { + lite::kernels::host::element_wise_range_to_range( + batch_arg.XAtBatch(batch_id), + batch_arg.YAtBatch(batch_id), + batch_arg.ZAtBatch(batch_id), + range_length, + op); + } + break; + } + default: { + LOG(FATAL) << "Un supported bcast type(host)"; + break; + } + } + } +}; + +template +void elementwise_compute_template(paddle::lite::KernelBase* kernel, + FastBCastFn fast_bcast_fn, + ElementWiseFn elementwise_fn, + BinaryOpFn op, + bool has_active = false, + std::string act_type = "") { + auto& param = kernel->template Param(); + auto x = param.X; + auto y = param.Y; + + auto* x_data = x->template data(); + auto* y_data = y->template data(); + auto* out_data = param.Out->template mutable_data(); + int axis = param.axis; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + int pre, n, post; + + if (elementwise_fn && x_dims == y_dims) { + elementwise_fn( + x_data, y_data, out_data, x_dims.production(), has_active, act_type); + } else if (fast_bcast_fn && + is_fast_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + 
fast_bcast_fn( + x_data, y_data, out_data, pre, n, post, has_active, act_type, false); + } else if (fast_bcast_fn && axis == -1 && + is_fast_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { + fast_bcast_fn( + x_data, y_data, out_data, pre, n, post, has_active, act_type, true); + } else { + auto batch_arg = + lite::kernels::host::GenBatchElementWiseArg(x, y, param.Out, axis); + LoongArchCommonElementWise::Run(batch_arg, op); + } + if (!elementwise_fn && !fast_bcast_fn) { + LOG(FATAL) << "unsupported elementwise_compute called"; + } +} + +#define ElementwiseOpCompute(op) \ + template \ + void Elementwise##op##Compute::Run() { \ + using LoongArchConfig = paddle::lite::loongarch::math::MergeConfig< \ + lite::loongarch::math::op##Config, \ + lite::loongarch::math::ActiveConfig>; \ + elementwise_compute_template( \ + this, \ + lite::loongarch::math::Elementwise_Broadcast_##op, \ + lite::loongarch::math::Elementwise_##op, \ + lite::loongarch::math::Naive##op); \ + } + +#define ElementwiseOpActivationCompute(op) \ + template \ + void Elementwise##op##ActivationCompute::Run() { \ + auto& param = \ + this->template Param(); \ + if (param.act_type == "relu") { \ + using LoongArchConfig = paddle::lite::loongarch::math::MergeConfig< \ + lite::loongarch::math::op##Config, \ + lite::loongarch::math::ActiveConfig>; \ + elementwise_compute_template< \ + operators::FusionElementwiseActivationParam, \ + float, \ + LoongArchConfig>(this, \ + lite::loongarch::math::Elementwise_Broadcast_##op, \ + lite::loongarch::math::Elementwise_##op, \ + lite::loongarch::math::Naive##op, \ + true, \ + param.act_type); \ + } else if (param.act_type == "tanh") { \ + using LoongArchConfig = paddle::lite::loongarch::math::MergeConfig< \ + lite::loongarch::math::op##Config, \ + lite::loongarch::math::ActiveConfig>; \ + elementwise_compute_template< \ + operators::FusionElementwiseActivationParam, \ + float, \ + LoongArchConfig>(this, \ + lite::loongarch::math::Elementwise_Broadcast_##op, \ + lite::loongarch::math::Elementwise_##op, \ + lite::loongarch::math::Naive##op, \ + true, \ + param.act_type); \ + } else if (param.act_type == "sigmoid") { \ + using LoongArchConfig = paddle::lite::loongarch::math::MergeConfig< \ + lite::loongarch::math::op##Config, \ + lite::loongarch::math::ActiveConfig>; \ + elementwise_compute_template< \ + operators::FusionElementwiseActivationParam, \ + float, \ + LoongArchConfig>(this, \ + lite::loongarch::math::Elementwise_Broadcast_##op, \ + lite::loongarch::math::Elementwise_##op, \ + lite::loongarch::math::Naive##op, \ + true, \ + param.act_type); \ + } else { \ + LOG(FATAL) << "unsupported active type:" << param.act_type; \ + using LoongArchConfig = paddle::lite::loongarch::math::MergeConfig< \ + lite::loongarch::math::op##Config, \ + lite::loongarch::math:: \ + ActiveConfig>; \ + elementwise_compute_template< \ + operators::FusionElementwiseActivationParam, \ + float, \ + LoongArchConfig>(this, \ + lite::loongarch::math::Elementwise_Broadcast_##op, \ + lite::loongarch::math::Elementwise_##op, \ + lite::loongarch::math::Naive##op, \ + true, \ + param.act_type); \ + } \ + } + +// clang-format off +ElementwiseOpCompute(Add) +ElementwiseOpActivationCompute(Add) +ElementwiseOpCompute(Sub) +ElementwiseOpActivationCompute(Sub) +ElementwiseOpCompute(Mul) +ElementwiseOpActivationCompute(Mul) +ElementwiseOpCompute(Div) +ElementwiseOpActivationCompute(Div) +ElementwiseOpCompute(FloorDiv) +ElementwiseOpActivationCompute(FloorDiv) +ElementwiseOpCompute(Max) +ElementwiseOpActivationCompute(Max) 
+ElementwiseOpCompute(Min) +ElementwiseOpActivationCompute(Min) +ElementwiseOpCompute(Mod) +ElementwiseOpActivationCompute(Mod) +ElementwiseOpCompute(Pow) +ElementwiseOpActivationCompute(Pow) +// clang-format on + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseAddCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_add, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseAddCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_add, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseAddCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fusion_elementwise_add_activation, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseAddActivationCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_sub, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseSubCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_sub, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseSubCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_sub, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseSubCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fusion_elementwise_sub_activation, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseSubActivationCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMulCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMulCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fusion_elementwise_mul_activation, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMulActivationCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_div, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseDivCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_div, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseDivCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_div, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseDivCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fusion_elementwise_div_activation, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseDivActivationCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + elementwise_floordiv, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseFloorDivCompute, + def) + .BindInput("X", 
{LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + elementwise_floordiv, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseFloorDivCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + elementwise_floordiv, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseFloorDivCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_pow, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwisePowCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_pow, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwisePowCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_pow, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwisePowCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mod, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseModCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mod, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseModCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_max, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMaxCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + 
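For completeness, a caller can confirm any of these registrations the same way the dropout unit test earlier in this patch does. The helper name below is made up, but the KernelRegistry call is the one that test exercises.

// Hypothetical helper: returns true if an elementwise_add kernel was registered.
#include "lite/core/op_registry.h"

bool has_elementwise_add_kernel() {
  auto kernels = paddle::lite::KernelRegistry::Global().Create("elementwise_add");
  return !kernels.empty();
}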
+REGISTER_LITE_KERNEL(elementwise_max, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMaxCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_max, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMaxCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fusion_elementwise_max_activation, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMaxActivationCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fusion_elementwise_min_activation, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMinActivationCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_min, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMinCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_min, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMinCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_min, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ElementwiseMinCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/loongarch/elementwise_compute.h b/lite/kernels/loongarch/elementwise_compute.h new file mode 100644 index 00000000000..f16d8bf2776 --- /dev/null +++ b/lite/kernels/loongarch/elementwise_compute.h @@ -0,0 +1,190 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class ElementwiseAddCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseAddCompute() = default; +}; + +template +class ElementwiseAddActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseAddActivationCompute() = default; +}; + +template +class ElementwiseSubCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseSubCompute() = default; +}; + +template +class ElementwiseSubActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseSubActivationCompute() = default; +}; + +template +class ElementwiseMulCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMulCompute() = default; +}; + +template +class ElementwiseMulActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMulActivationCompute() = default; +}; + +template +class ElementwiseMaxCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMaxCompute() = default; +}; + +template +class ElementwiseMaxActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMaxActivationCompute() = default; +}; + +template +class ElementwiseMinCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMinCompute() = default; +}; + +template +class ElementwiseMinActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMinActivationCompute() = default; +}; + +template +class ElementwiseDivCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseDivCompute() = default; +}; + +template +class ElementwiseDivActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseDivActivationCompute() = default; +}; + +template +class ElementwiseFloorDivCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseFloorDivCompute() = default; +}; + +template +class ElementwiseFloorDivActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseFloorDivActivationCompute() = default; +}; + +template +class ElementwiseModCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseModCompute() = default; +}; + +template +class ElementwiseModActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseModActivationCompute() = default; +}; + +template +class ElementwisePowCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwisePowCompute() = default; +}; + +template +class ElementwisePowActivationCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwisePowActivationCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // 
namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/elementwise_op_function.h b/lite/kernels/loongarch/elementwise_op_function.h new file mode 100644 index 00000000000..6ab8d823342 --- /dev/null +++ b/lite/kernels/loongarch/elementwise_op_function.h @@ -0,0 +1,805 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/fluid/for_range.h" +#include "lite/backends/loongarch/fluid/transform.h" +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/utils/log/cp_logging.h" +#include "lite/utils/variant.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) + * + * New parameter: *mid_flag* is added to solve m*n*k & m*1*k + * broadcast cases. + */ +inline void get_mid_dims(const lite::DDim &x_dims, + const lite::DDim &y_dims, + const int axis, + int *pre, + int *n, + int *post, + int *mid_flag = NULL) { + *pre = 1; + *n = 1; + *post = 1; + if (mid_flag != NULL) { + *mid_flag = 0; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (size_t i = 0; i < y_dims.size(); ++i) { + if (x_dims[i + axis] != y_dims[i]) { + CHECK_EQ((y_dims[i] == 1 || x_dims[i + axis] == 1), true) + << "Broadcast y or x dimension is not 1."; + *mid_flag = 1; + return; + } + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + } else { // for fused_elementwise_activation_op. keep the old version. 
+ for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + + for (size_t i = 0; i < y_dims.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch."; + (*n) *= y_dims[i]; + } + + for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + } +} + +inline lite::DDim trim_trailing_singular_dims(const lite::DDim &dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + if (actual_dims_size == dims.size()) return dims; + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (size_t i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return lite::DDim(); + } + lite::DDim actual_dims = lite::DDim(trim_dims); + return actual_dims; +} + +template +class RowwiseTransformIterator; + +template +class MidWiseTransformIterator; + +// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17 +template +class RowwiseTransformIterator + : public std::iterator { + public: + RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + + RowwiseTransformIterator &operator++() { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + return *this; + } + + RowwiseTransformIterator &operator+(int n) { + while (n-- > 0) { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + + return *this; + } + + bool operator==( + const RowwiseTransformIterator &rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=( + const RowwiseTransformIterator &rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T &operator*() { return ptr_[i_]; } + + private: + const T *ptr_; + int i_; + int64_t n_; +}; + +template +class MidWiseTransformIterator + : public std::iterator { + public: + MidWiseTransformIterator(const T *ptr, int n, int post) + : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} + + MidWiseTransformIterator &operator++() { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + return *this; + } + + MidWiseTransformIterator &operator+(int n) { + while (n-- > 0) { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + } + return *this; + } + + bool operator==( + const MidWiseTransformIterator &rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=( + const MidWiseTransformIterator &rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T &operator*() { return ptr_[i_]; } + + private: + const T *ptr_; + int64_t i_; + int64_t j_; + int64_t n_; + int64_t post_; +}; + +template +class TransformFunctor { + public: + TransformFunctor(const lite::Tensor *x, + const lite::Tensor *y, + lite::Tensor *z, + const lite::Context &ctx, + Functor func, + const bool is_xsize_larger = true) + : x_(x->template data()), + y_(y->template data()), + z_(z->mutable_data()), + nx_(x->numel()), + ctx_(ctx), + func_(func), + is_xsize_larger_(is_xsize_larger) { + if (is_xsize_larger_ == false) { + nx_ = y->numel(); + } + } + + inline void Run() const { + lite::fluid::Transform trans; + trans(ctx_, x_, x_ + nx_, y_, z_, func_); + } + + inline void RunRowWise(int n, int pre) const { + lite::fluid::Transform trans; + if (is_xsize_larger_) { + trans(ctx_, + x_, + x_ + nx_, + RowwiseTransformIterator(y_, n), + z_, + func_); + } else { + trans(ctx_, + y_, + y_ + nx_, + RowwiseTransformIterator(x_, n), + z_, + func_); + } + } + + 
inline void RunMidWise(int n, int pre, int post) const { + lite::fluid::Transform trans; + if (is_xsize_larger_) { + trans(ctx_, + x_, + x_ + nx_, + MidWiseTransformIterator(y_, n, post), + z_, + func_); + } else { + trans(ctx_, + y_, + y_ + nx_, + MidWiseTransformIterator(x_, n, post), + z_, + func_); + } + } + + private: + const T *x_; + const T *y_; + OutType *z_; + int64_t nx_; + const lite::Context &ctx_; + Functor func_; + bool is_xsize_larger_; +}; + +inline void GetBroadcastDimsArrays(const DDim &x_dims, + const DDim &y_dims, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + const int max_dim, + const int axis) { + CHECK_GE(axis, 0) << "Axis should be great than or equal to 0."; + CHECK_LT(axis, max_dim) << "Axis should be less than max(x_dim, y_dim)."; + + if (x_dims.size() > y_dims.size()) { + std::fill(y_dims_array, y_dims_array + axis, 1); + if (axis + y_dims.size() < max_dim) { + std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1); + } + for (int i = 0; i < x_dims.size(); i++) x_dims_array[i] = x_dims[i]; + for (int i = 0; i < y_dims.size(); i++) + *(y_dims_array + axis + i) = y_dims[i]; + } else { + std::fill(x_dims_array, x_dims_array + axis, 1); + if (axis + x_dims.size() < max_dim) { + std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1); + } + for (int i = 0; i < x_dims.size(); i++) + *(x_dims_array + axis + i) = x_dims[i]; + for (int i = 0; i < y_dims.size(); i++) *(y_dims_array + i) = y_dims[i]; + } + + for (int i = 0; i < max_dim; i++) { + CHECK_EQ((x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || + y_dims_array[i] <= 1), + true) + << "Broadcast dimension mismatch. Operands could not be broadcast."; + + if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) || + (x_dims_array[i] == 1 && y_dims_array[i] == 1)) { + out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); + } else { + out_dims_array[i] = -1; + } + } +} + +inline int GetElementwiseIndex(const int *x_dims_array, + const int max_dim, + const int *index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +inline void UpdateElementwiseIndexArray(const int *out_dims_array, + const int max_dim, + int *index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +template +void CommonForwardBroadcastCPU(const Tensor *x, + const Tensor *y, + Tensor *z, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T *x_data = x->data(); + const T *y_data = y->data(); + CHECK_EQ((x_data != nullptr), true) << "The input X should not be empty."; + CHECK_EQ((y_data != nullptr), true) << "The input Y should not be empty."; + + OutType *out_data = z->mutable_data(); + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + 
UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +void CommonElementwiseBroadcastForward(const Tensor *x, + const Tensor *y, + Tensor *z, + const DDim &x_dims, + const DDim &y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(static_cast(x_dims.size()) - + static_cast(y_dims.size())) + : axis); + CHECK_GE(axis, 0) << "Axis should be great than or equal to 0."; + CHECK_LT(axis, max_dim) << "Axis should be less than max(x_dim, y_dim)."; + + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + func, + is_xsize_larger); +} + +template +void ElementwiseComputeEx(const lite::Context &ctx, + const lite::Tensor *x, + const lite::Tensor *y, + int axis, + Functor func, + lite::Tensor *z) { + auto x_dims = x->dims(); + auto y_dims = y->dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + TransformFunctor functor( + x, y, z, ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + int tmp = std::abs(static_cast(x_dims.size()) - + static_cast(y_dims.size())); + axis = (axis == static_cast(-1) ? tmp : axis); + + CHECK_GE(axis, 0) << "Axis should be great than or equal to 0."; + CHECK_LT(axis, max_dim) << "Axis should be less than max(x_dim, y_dim)."; + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +// FusedElemwiseAndAct +// --- forward +template +struct FusedElemwiseAndActNoBroadcast { + HOSTDEVICE void operator()(size_t i) { + T y_val = y_[i]; + T x_val = x_[i]; + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor_.GetIntermediateOut(x_val, y_val); + intermediate_out_[i] = intermeidiate_out; + out_[i] = + compound_functor_.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out_[i] = compound_functor_.GetOut(x_val, y_val); + } + } + + const T *x_; + const T *y_; + CompoundFunctor compound_functor_; + T *out_; + T *intermediate_out_; +}; + +// FusedElemwiseAndActBroadcast1: +// In this case, X and Y can be reshaped to a matrix. 
+// For example shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) and axis = -1 or 2, +// X can be reshaped to (6, 20) and Y can be reshaped to (1, 20) +template +static void FusedElemwiseAndActBroadcast1CPU(const T *x, + const T *y, + CompoundFunctor compound_functor, + int h, + int w, + T *out, + T *intermediate_out) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int offset = i * w + j; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? x[offset] : x[j]; + int64_t intermediate_out_offset; + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = + compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + } + } +} + +// FusedElemwiseAndActBroadcast2 +// In this case, X and Y can be reshaped to a matrix. +// For example shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4) and axis = 1, +// X can be reshaped to (2, 12, 5) and Y can be reshaped to (1, 12, 1) +// pre = 2, n = 12, post = 5 +template +static void FusedElemwiseAndActBroadcast2CPU(const T *x, + const T *y, + int pre, + int n, + int post, + CompoundFunctor compound_functor, + T *out, + T *intermediate_out) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int offset = i * n * post + j * post + k; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? x[offset] : x[j]; + int64_t intermediate_out_offset; + + if (KeepIntermediateOut) { + T intermeidiate_out = + compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = compound_functor.GetOutUseIntermediateOut( + x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + } + } + } +} + +template +void FusedElemwiseAndActComputeNoBroadcast(const lite::Context &ctx, + const lite::DDim &x_dim, + const lite::Tensor &x, + const lite::Tensor &y, + CompoundFunctor compound_functor, + lite::Tensor *out, + lite::Tensor *intermediate_out) { + size_t N = static_cast(x_dim.production()); + + lite::fluid::ForRange for_range(ctx, N); + + for_range( + FusedElemwiseAndActNoBroadcast{ + x.data(), + y.data(), + compound_functor, + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()}); +} + +template +void FusedElemwiseAndActComputeWithBroadcast(const lite::Context &ctx, + const lite::DDim &x_dim, + const lite::DDim &y_dim_untrimed, + const lite::Tensor &x, + const lite::Tensor &y, + CompoundFunctor compound_functor, + int axis, + lite::Tensor *out, + lite::Tensor *intermediate_out) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + + if (post == 1) { + int h = pre; + int w = n; + FusedElemwiseAndActBroadcast1CPU( + x.data(), + y.data(), + compound_functor, + h, + w, + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()); + + } else { + FusedElemwiseAndActBroadcast2CPU( + x.data(), + y.data(), + pre, + n, + post, + compound_functor, + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()); + } +} + +template +void FusedElemwiseAndActComputeEx(const lite::Context &ctx, + const lite::Tensor &x, + const lite::Tensor &y, + int axis, + CompoundFunctor compound_functor, + lite::Tensor *out, + lite::Tensor *intermediate_out) { + if (KeepIntermediateOut) { + CHECK(intermediate_out) << "The save_intermediate_out is opened, " + "intermediate_out should not be nullptr."; + } + + const lite::DDim &x_dim = x.dims(); + const lite::DDim &y_dim = y.dims(); + if (x.dims() == y.dims()) { + FusedElemwiseAndActComputeNoBroadcast( + ctx, x_dim, x, y, compound_functor, out, intermediate_out); + } else { + // Whether the shape of Y is a continuous subsequence of X, + // For more information please refer to the op's introduction. + bool bcast_y = x.dims().size() >= y.dims().size(); + if (x.dims().size() == y.dims().size()) { + for (int i = 0; i < x.dims().size(); ++i) { + if (x.dims()[i] < y.dims()[i]) { + bcast_y = false; + break; + } + } + } + + // z = f1(x, f2(y)) + // z = f1(f2(x, y)) + if (bcast_y) { // Y should be broadcast. + // In this case, + // for 'f2(y)', the shape of intermediate_out should be equal to the + // shape + // of Y. + // for 'f2(x, y)', the shape of intermediate_out should be equal to the + // shape of Out. + // the shape of Out should be equal to the shape of X. + FusedElemwiseAndActComputeWithBroadcast( + ctx, + x_dim /*OutShape*/, + y_dim, + x, + y, + compound_functor, + axis, + out, + intermediate_out); + } else { + // In this case, + // for 'f2(y)', the shape of intermediate_out should be equal to the + // shape + // of Out. + // for 'f2(x, y)', the shape of intermediate_out should be equal to the + // shape of Out. + // the shape of Out should be equal to the shape of Y. + FusedElemwiseAndActComputeWithBroadcast( + ctx, + y_dim /*OutShape*/, + x_dim, + x, + y, + compound_functor, + axis, + out, + intermediate_out); + } + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/fc_compute.cc b/lite/kernels/loongarch/fc_compute.cc new file mode 100644 index 00000000000..d11a1af7718 --- /dev/null +++ b/lite/kernels/loongarch/fc_compute.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
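// Reference sketch (illustrative only, not part of this patch): the pre/n/post
// fast path that ElementwiseComputeEx above selects when the generic
// CommonElementwiseBroadcastForward is not required. With is_xsize_larger set,
// x is viewed as (pre, n, post) and y, of length n, is broadcast over the
// outer and inner dimensions; post == 1 degenerates to the row-wise case.
#include <cstddef>

template <typename T, typename F>
void BroadcastMidWiseSketch(const T* x, const T* y, T* z,
                            int pre, int n, int post, F f) {
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      for (int k = 0; k < post; ++k) {
        const std::size_t idx =
            static_cast<std::size_t>(i) * n * post +
            static_cast<std::size_t>(j) * post + k;
        z[idx] = f(x[idx], y[j]);  // y is indexed only by the middle dimension
      }
    }
  }
}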
+ +#include "lite/kernels/loongarch/fc_compute.h" +#include "lite/backends/loongarch/math/gemm_s8u8_compute.h" +#include "lite/backends/loongarch/math/saturate.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +#define GEMM_OUT_INT8 \ + lite::loongarch::math::generate_gemm_s8u8_loongarch_kern gemm(false, \ + false, \ + m, \ + n, \ + k, \ + i_data, \ + n, \ + w_scale, \ + input_scale, \ + output_scale, \ + b_data, \ + relu_type, \ + 1.f); + +#define GEMM_OUT_FLOAT \ + lite::loongarch::math::generate_gemm_s8u8_loongarch_kern gemm(false, \ + false, \ + m, \ + n, \ + k, \ + i_data, \ + n, \ + w_scale, \ + input_scale, \ + output_scale, \ + b_data, \ + relu_type, \ + 1.f); + +template +class FCFunctor { + public: + void operator()(const lite::LoongArchContext& context, + const int M, + const int N, + const int K, + const T* X, + const T* W, + T* Y, + const T* B = nullptr, + bool relu = false, + bool padding_weights = false) { + auto blas = lite::loongarch::math::GetBlas(context); + T* Y1_data = nullptr; + + auto compute = + relu + ? jit::KernelFuncs, fluid::CPUPlace>::Cache() + .At(N) + : jit::KernelFuncs, fluid::CPUPlace>::Cache().At( + N); + auto parallel_compute = [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + T* dst = Y + i * N; + T* src = Y1_data ? Y1_data + i * (N + 4) : dst; + compute(B, src, dst, N); + } + }; + + // Because of the overhead of memcpy, we only do padding for GEMM + // when weights is already padded in fc_fuse_pass. + if (padding_weights) { + const int NN = N + 4; + const int KK = K + 4; + + // NOTE: here need to mutable_data for temporary Tensor X1 and Y1, + // the overhead is unmeasured. + lite::Tensor X1; + X1.Resize(std::vector{M * KK}); + T* X1_data = X1.mutable_data(); + + lite::Tensor Y1; + Y1.Resize(std::vector{M * NN}); + Y1_data = Y1.mutable_data(); + + auto parallel_memcpy_x = [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + memcpy(X1_data + i * KK, X + i * K, K * sizeof(T)); + } + }; + + parallel_memcpy_x(0, M); + + blas.GEMM(false, + false, + M, + N, + K, + static_cast(1.0), + X1_data, + KK, + W, + NN, + static_cast(0.0), + Y1_data, + NN); + + if (!B) { + auto parallel_memcpy_y = [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { + memcpy(Y + i * N, Y1_data + i * NN, N * sizeof(T)); + } + }; + parallel_memcpy_y(0, M); + return; + } + parallel_compute(0, M); + } else { + blas.MatMul(M, N, K, X, W, Y); + if (!B) { + return; + } + parallel_compute(0, M); + } + } +}; + +template <> +void FcCompute::Run() { + auto& param = *param_.get_mutable(); + auto* input = param.input; + auto* w = param.w; + auto* bias = param.bias; + auto* output = param.output; + bool with_relu = (param.activation_type == "relu") ? true : false; + + bool padding_weights = param.padding_weights; + const auto& w_dims = w->dims(); + auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0]; + auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; + + int M = output->dims().production() / w_dims1; + + const float* input_data = input->template data(); + const float* w_data = w->template data(); + float* output_data = output->template mutable_data(); + + auto& context = ctx_->As(); + FCFunctor fc; + fc(context, + M, + w_dims1, + w_dims0, + input_data, + w_data, + output_data, + bias ? 
bias->template data() : NULL, + with_relu, + padding_weights); +} + +template <> +void FcCompute::Run() { + auto& param = this->Param(); + auto* i_data = param.input->data(); + auto* o_data = param.output->mutable_data(); + auto* w_data = param.w->data(); + const float* b_data = param.bias ? param.bias->data() : nullptr; + auto w_dims = param.w->dims(); + int k = w_dims[0]; + int n = w_dims[1]; + int m = param.output->dims().production() / n; + float input_scale = param.input_scale; + float output_scale = param.output_scale; + int relu_type = (param.activation_type == "relu") ? 1 : 0; + float* w_scale = + static_cast(TargetMalloc(TARGET(kLoongArch), m * sizeof(float))); + + if (param.activation_type != "" && param.activation_type != "relu") + LOG(FATAL) << "not support fuse activation except relu."; + + if (param.weight_scale.size() == 1) { + for (int i = 0; i < m; i++) w_scale[i] = param.weight_scale[0]; + GEMM_OUT_INT8; + gemm.compute(i_data, w_data, o_data); + } else if (param.weight_scale.size() == m) { + for (int i = 0; i < m; i++) w_scale[i] = param.weight_scale[i]; + GEMM_OUT_INT8; + gemm.compute(i_data, w_data, o_data); + } else if (param.weight_scale.size() == n) { + for (int i = 0; i < m; i++) w_scale[i] = 1.f; + float* tmp_output = + static_cast(TargetMalloc(TARGET(kLoongArch), m * n * sizeof(float))); + GEMM_OUT_FLOAT; + gemm.compute(i_data, w_data, tmp_output); + for (int nn = 0; nn < n; nn++) { + float tmp_scale = param.weight_scale[nn] / output_scale; + for (int mm = 0; mm < m; mm++) { + int offt = mm * n + nn; + o_data[offt] = lite::loongarch::math::saturate_cast( + roundf(tmp_output[offt] * tmp_scale)); + o_data[offt] = o_data[offt] < -127 ? -127 : o_data[offt]; + } + } + TargetFree(TARGET(kLoongArch), tmp_output); + } else { + LOG(FATAL) << "weight scale size is not 1, N or M, not support yet."; + } + TargetFree(TARGET(kLoongArch), w_scale); +} + +template <> +void FcCompute::Run() { + auto& param = this->Param(); + auto* i_data = param.input->data(); + auto* o_data = param.output->mutable_data(); + auto* w_data = param.w->data(); + const float* b_data = param.bias ? param.bias->data() : nullptr; + auto w_dims = param.w->dims(); + int k = w_dims[0]; + int n = w_dims[1]; + int m = param.output->dims().production() / n; + int relu_type = (param.activation_type == "relu") ? 
1 : 0; + float input_scale = param.input_scale; + float output_scale = param.output_scale; + float* w_scale = + static_cast(TargetMalloc(TARGET(kLoongArch), m * sizeof(float))); + + if (param.activation_type != "" && param.activation_type != "relu") + LOG(FATAL) << "not support fuse activation except relu."; + + if (param.weight_scale.size() == 1) { + for (int i = 0; i < m; i++) w_scale[i] = param.weight_scale[0]; + GEMM_OUT_FLOAT; + gemm.compute(i_data, w_data, o_data); + } else if (param.weight_scale.size() == m) { + for (int i = 0; i < m; i++) w_scale[i] = param.weight_scale[i]; + GEMM_OUT_FLOAT; + gemm.compute(i_data, w_data, o_data); + } else if (param.weight_scale.size() == n) { + for (int i = 0; i < m; i++) w_scale[i] = 1.f; + GEMM_OUT_FLOAT; + gemm.compute(i_data, w_data, o_data); + for (int nn = 0; nn < n; nn++) { + float tmp_scale = param.weight_scale[nn]; + for (int mm = 0; mm < m; mm++) { + int offt = mm * n + nn; + o_data[offt] = o_data[offt] * tmp_scale; + } + } + } else { + LOG(FATAL) << "weight scale size is not 1, N or M, not support yet."; + } + TargetFree(TARGET(kLoongArch), w_scale); +} + +#undef GEMM_OUT_INT8 +#undef GEMM_OUT_FLOAT + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::loongarch::FcCompute + FcCompute_FP32; +typedef paddle::lite::kernels::loongarch::FcCompute + FcCompute_int8_fp32; +typedef paddle::lite::kernels::loongarch::FcCompute + FcCompute_int8_int8; + +REGISTER_LITE_KERNEL(fc, kLoongArch, kFloat, kNCHW, FcCompute_FP32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fc, kLoongArch, kInt8, kNCHW, FcCompute_int8_int8, int8_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fc, kLoongArch, kInt8, kNCHW, FcCompute_int8_fp32, fp32_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/fc_compute.h b/lite/kernels/loongarch/fc_compute.h new file mode 100644 index 00000000000..8a7e1eeb3f3 --- /dev/null +++ b/lite/kernels/loongarch/fc_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
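// Reference semantics of the fc kernels above (illustrative only, not the
// optimized path): Out[m][n] = sum_k X[m][k] * W[k][n] + B[n], optionally
// followed by ReLU. The kFloat kernel realizes this through the LoongArch
// BLAS/JIT helpers; the kInt8 kernels additionally rescale with the
// per-channel w_scale and the input/output scales shown above.
static void FcReferenceSketch(const float* X, const float* W, const float* B,
                              float* Out, int M, int N, int K, bool relu) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = (B != nullptr) ? B[n] : 0.f;
      for (int k = 0; k < K; ++k) {
        acc += X[m * K + k] * W[k * N + n];
      }
      Out[m * N + n] = (relu && acc < 0.f) ? 0.f : acc;
    }
  }
}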
+ +#pragma once + +#include +#include "lite/backends/loongarch/jit/helper.h" +#include "lite/backends/loongarch/jit/kernel_base.h" +#include "lite/backends/loongarch/jit/kernels.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/fc_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class FcCompute : public KernelLite { + public: + using param_t = operators::FcParam; + + virtual void Run(); + + virtual ~FcCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/gather_compute.cc b/lite/kernels/loongarch/gather_compute.cc new file mode 100644 index 00000000000..adc2d6d14f3 --- /dev/null +++ b/lite/kernels/loongarch/gather_compute.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/gather_compute.h" + +REGISTER_LITE_KERNEL(gather, kLoongArch, kFloat, kNCHW, GatherInt32Int32, int32int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Axis", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gather, kLoongArch, kFloat, kNCHW, GatherInt64Int64, int64int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Axis", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .Finalize(); +REGISTER_LITE_KERNEL(gather, kLoongArch, kFloat, kNCHW, GatherInt64Int32, int64int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Axis", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .Finalize(); +REGISTER_LITE_KERNEL(gather, kLoongArch, kFloat, kNCHW, GatherInt32Int64, int32int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Axis", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/loongarch/gather_compute.h b/lite/kernels/loongarch/gather_compute.h new file mode 100644 index 
00000000000..2340b650cbb --- /dev/null +++ b/lite/kernels/loongarch/gather_compute.h @@ -0,0 +1,16 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/kernels/host/gather_compute.h" diff --git a/lite/kernels/loongarch/grid_sampler_compute.cc b/lite/kernels/loongarch/grid_sampler_compute.cc new file mode 100644 index 00000000000..620ce7ee563 --- /dev/null +++ b/lite/kernels/loongarch/grid_sampler_compute.cc @@ -0,0 +1,353 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/grid_sampler_compute.h" +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +using EigenTensor = lite::fluid::EigenTensor; + +using Array4 = Eigen::DSizes; + +template +inline bool IsInBound(T x, T y, T x_max, T y_max) { + return !(x < static_cast(0) || x > x_max || y < static_cast(0) || + y > y_max); +} + +template +void Unnormalize(const LoongArchContext& ctx, + Tensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners) { + auto place = lite::fluid::EigenDeviceType(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + + if (!align_corners) { + auto factor = static_cast((max_val + 1) * 0.5); + grid_slice_t.device(place) = + (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); + } else { + auto factor = static_cast(max_val * 0.5); + grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; + } +} + +template +void Clip(const LoongArchContext& ctx, + Tensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode) { + auto place = lite::fluid::EigenDeviceType(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + if (padding_mode == "border") { + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto 
grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } + } +} + +template +void CalcGridLocations(const LoongArchContext& ctx, + const Tensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + Tensor* grid_x, + Tensor* grid_y) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + DDim grid_dim{{n, out_h, out_w}}; + grid_x->Resize(grid_dim); + grid_y->Resize(grid_dim); + T* grid_x_data = grid_x->template mutable_data(); + T* grid_y_data = grid_y->template mutable_data(); + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h - 1, align_corners); + + Clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); + Clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); +} + +template +void GetGridPointValue(const Tensor& input, + Tensor* output, + const Tensor& x, + const Tensor& y) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int in_h = input.dims()[2]; + const int in_w = input.dims()[3]; + const int out_h = x.dims()[1]; + const int out_w = x.dims()[2]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto output_t = + EigenTensor::From(*output).setConstant(static_cast(0)); + auto input_t = EigenTensor::From(input); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound(x_t(i, k, l), + y_t(i, k, l), + static_cast(in_w - 1), + static_cast(in_h - 1))) { + for (int j = 0; j < c; j++) { + output_t(i, j, k, l) = + input_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))); + } + } + } + } + } +} + +template +void AllNeigbors(const LoongArchContext& ctx, + const Tensor& input, + Tensor* grid_x, + Tensor* grid_y, + Tensor* x_w, + Tensor* x_e, + Tensor* y_n, + Tensor* y_s, // positions + Tensor* d_w, + Tensor* d_e, + Tensor* d_n, + Tensor* d_s, // distance + Tensor* v_wn, + Tensor* v_en, + Tensor* v_ws, + Tensor* v_es) { // values + auto place = lite::fluid::EigenDeviceType(); + + const int c = input.dims()[1]; + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + // calculate coords of 4 corner points + DDim dim{{n, out_h, out_w}}; + x_w->Resize(dim); + x_e->Resize(dim); + y_n->Resize(dim); + y_s->Resize(dim); + x_w->template mutable_data(); + x_e->template mutable_data(); + y_n->template mutable_data(); + y_s->template mutable_data(); + auto x_w_t = EigenTensor::From(*x_w); + auto x_e_t = EigenTensor::From(*x_e); + auto y_n_t = EigenTensor::From(*y_n); + auto y_s_t = EigenTensor::From(*y_s); + + auto grid_x_t = EigenTensor::From(*grid_x); + auto grid_y_t = EigenTensor::From(*grid_y); + + x_w_t.device(place) = grid_x_t.floor(); + x_e_t.device(place) = x_w_t + static_cast(1); + y_n_t.device(place) = grid_y_t.floor(); + y_s_t.device(place) = y_n_t + static_cast(1); + + // calculate distances to 4 sides + d_w->Resize(dim); + d_e->Resize(dim); + d_n->Resize(dim); + 
d_s->Resize(dim); + d_w->template mutable_data(); + d_e->template mutable_data(); + d_n->template mutable_data(); + d_s->template mutable_data(); + auto d_w_t = EigenTensor::From(*d_w); + auto d_e_t = EigenTensor::From(*d_e); + auto d_n_t = EigenTensor::From(*d_n); + auto d_s_t = EigenTensor::From(*d_s); + d_w_t.device(place) = grid_x_t - x_w_t; + d_e_t.device(place) = x_e_t - grid_x_t; + d_n_t.device(place) = grid_y_t - y_n_t; + d_s_t.device(place) = y_s_t - grid_y_t; + + // calc 4 corner points value + DDim v_dim{{n, c, out_h, out_w}}; + v_wn->Resize(v_dim); + v_en->Resize(v_dim); + v_ws->Resize(v_dim); + v_es->Resize(v_dim); + v_wn->template mutable_data(); + v_en->template mutable_data(); + v_ws->template mutable_data(); + v_es->template mutable_data(); + GetGridPointValue(input, v_wn, *x_w, *y_n); + GetGridPointValue(input, v_en, *x_e, *y_n); + GetGridPointValue(input, v_ws, *x_w, *y_s); + GetGridPointValue(input, v_es, *x_e, *y_s); +} + +template +void BilinearInter(const LoongArchContext& ctx, + const Tensor& input, + Tensor* grid_x, + Tensor* grid_y, + Tensor* out) { + auto place = lite::fluid::EigenDeviceType(); + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + Tensor x_w, x_e, y_n, y_s; + Tensor d_w, d_e, d_n, d_s; + Tensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, + grid_y, + &x_w, + &x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto d_w_scaled_t = + d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_e_scaled_t = + d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_n_scaled_t = + d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_s_scaled_t = + d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + auto output_t = EigenTensor::From(*out); + // bilinear interpolaetion by 4 corner points + output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + + v_en_t * d_w_scaled_t * d_s_scaled_t + + v_ws_t * d_e_scaled_t * d_n_scaled_t + + v_es_t * d_w_scaled_t * d_n_scaled_t; +} + +template +void GridSamplerCompute::Run() { +#ifndef WIN32 + auto& param = this->Param(); + auto& context = ctx_->As(); + auto* input = param.x; + auto* grid = param.grid; + auto* output = param.out; + const std::string padding_mode = param.padding_mode; + const std::string mode = param.mode; + const bool align_corners = param.align_corners; + + auto input_dims = input->dims(); + const int in_h = input_dims[2]; + const int in_w = input_dims[3]; + + output->template mutable_data(); + lite::loongarch::math::SetConstant set_zero; + set_zero(context, output, static_cast(0)); + + Tensor grid_x, grid_y; + CalcGridLocations(context, + *grid, + in_h, + in_w, + align_corners, + padding_mode, + &grid_x, + &grid_y); + if (mode == "bilinear") { + BilinearInter(context, *input, &grid_x, &grid_y, output); + } else if (mode == "nearest") { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GetGridPointValue(*input, output, 
grid_x, grid_y); + } +#else + LOG(FATAL) << "Error: This model is not supported on Windows Os yet, because " + "grid_sample op is not supported on windows Paddle-Lite, " + "please update your Paddle-Lite version."; +#endif +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(grid_sampler, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::GridSamplerCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Grid", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/grid_sampler_compute.h b/lite/kernels/loongarch/grid_sampler_compute.h new file mode 100644 index 00000000000..c66cadd1b3d --- /dev/null +++ b/lite/kernels/loongarch/grid_sampler_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class GridSamplerCompute : public KernelLite { + public: + using param_t = operators::GridSamplerParam; + + void Run() override; + + virtual ~GridSamplerCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/group_norm_compute.cc b/lite/kernels/loongarch/group_norm_compute.cc new file mode 100644 index 00000000000..f4c9175b2c3 --- /dev/null +++ b/lite/kernels/loongarch/group_norm_compute.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/group_norm_compute.h" +#include "lite/backends/loongarch/xxl.h" +#include +#include "lite/backends/loongarch/math/include/group_norm.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void GroupNormCompute::PrepareForRun() {} + +void GroupNormCompute::Run() { + auto& param = this->Param(); + const float* in = param.x->data(); + const float* scale = + param.scale == nullptr ? nullptr : param.scale->data(); + const float* bias = + param.bias == nullptr ? 
nullptr : param.bias->data(); + float* out = param.out->mutable_data(); + float* saved_mean = param.saved_mean->mutable_data(); + float* saved_variance = param.saved_variance->mutable_data(); + float epsilon = param.epsilon; + int groups = param.groups; + + int n = param.x->dims()[0]; + int c = param.x->dims()[1]; + int height = param.x->dims()[2]; + int width = param.x->dims()[3]; + + lite::loongarch::math::group_norm(in, + out, + n, + c, + height, + width, + epsilon, + groups, + scale, + bias, + saved_mean, + saved_variance); +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(group_norm, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::GroupNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/group_norm_compute.h b/lite/kernels/loongarch/group_norm_compute.h new file mode 100644 index 00000000000..3e447536ffa --- /dev/null +++ b/lite/kernels/loongarch/group_norm_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +class GroupNormCompute : public KernelLite { + public: + using param_t = operators::GroupNormParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~GroupNormCompute() = default; + + private: +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/gru_compute.cc b/lite/kernels/loongarch/gru_compute.cc new file mode 100644 index 00000000000..37aaccafe7a --- /dev/null +++ b/lite/kernels/loongarch/gru_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
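// Assumed semantics of lite::loongarch::math::group_norm used by the
// group_norm kernel above (the routine itself is outside this patch, so this
// naive NCHW sketch is an assumption and omits the SavedMean / SavedVariance
// outputs): channels are split into `groups` contiguous groups, mean and
// variance are taken over each group's (C / groups) * H * W elements, then
// per-channel scale and bias are applied.
#include <cmath>
#include <cstddef>

static void GroupNormReferenceSketch(const float* in, float* out,
                                     int n, int c, int h, int w, int groups,
                                     float eps, const float* scale,
                                     const float* bias) {
  const int cpg = c / groups;  // channels per group
  const int spatial = h * w;
  for (int b = 0; b < n; ++b) {
    for (int g = 0; g < groups; ++g) {
      const float* src =
          in + (static_cast<std::size_t>(b) * c + g * cpg) * spatial;
      const int cnt = cpg * spatial;
      float mean = 0.f, var = 0.f;
      for (int i = 0; i < cnt; ++i) mean += src[i];
      mean /= cnt;
      for (int i = 0; i < cnt; ++i) var += (src[i] - mean) * (src[i] - mean);
      var /= cnt;
      const float inv_std = 1.f / std::sqrt(var + eps);
      for (int ci = 0; ci < cpg; ++ci) {
        const int ch = g * cpg + ci;
        const float a = scale ? scale[ch] : 1.f;
        const float s = bias ? bias[ch] : 0.f;
        const std::size_t base =
            (static_cast<std::size_t>(b) * c + ch) * spatial;
        for (int i = 0; i < spatial; ++i) {
          out[base + i] = (in[base + i] - mean) * inv_std * a + s;
        }
      }
    }
  }
}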
+ +#include "lite/kernels/loongarch/gru_compute.h" +#include "lite/utils/env.h" + +// DEFINE_int32(paddle_num_threads, +// 1, +// "Number of threads for each paddle instance."); +int32_t paddle_num_threads = + paddle::lite::GetIntFromEnv("paddle_num_threads", 1); + +REGISTER_LITE_KERNEL(gru, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::GRUCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("BatchGate", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("BatchResetHiddenPrev", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("BatchHidden", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/gru_compute.h b/lite/kernels/loongarch/gru_compute.h new file mode 100644 index 00000000000..8ae455635f5 --- /dev/null +++ b/lite/kernels/loongarch/gru_compute.h @@ -0,0 +1,152 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
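// The GRUCompute header below evaluates this per-timestep recurrence on
// re-batched sequences (sketch with illustrative names; u and c stand for the
// already-activated update gate and candidate state, so the gate GEMMs and
// activations are assumed to have been applied):
//   u = sigmoid(x_u + U_u * h_prev)
//   r = sigmoid(x_r + U_r * h_prev)
//   c = act(x_c + U_c * (r .* h_prev))
//   h = origin_mode ? u .* h_prev + (1 - u) .* c
//                   : u .* c      + (1 - u) .* h_prev
#include <cstddef>
#include <vector>

static std::vector<float> GruStepReferenceSketch(
    const std::vector<float>& u, const std::vector<float>& c,
    const std::vector<float>& h_prev, bool origin_mode) {
  std::vector<float> h(h_prev.size());
  for (std::size_t i = 0; i < h.size(); ++i) {
    h[i] = origin_mode ? u[i] * h_prev[i] + (1.f - u[i]) * c[i]
                       : u[i] * c[i] + (1.f - u[i]) * h_prev[i];
  }
  return h;
}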
+#pragma once + +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/gru_compute.h" +#include "lite/backends/loongarch/math/gru_cpu_kernel.h" +#include "lite/backends/loongarch/math/gru_kernel.h" +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/backends/loongarch/math/sequence2batch.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +// DECLARE_int32(paddle_num_threads); +extern int32_t paddle_num_threads; + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +using Tensor = lite::Tensor; + +template +inline void ReorderInitState(const lite::Context& context, + const Tensor& src, + const std::vector& index_lod, + Tensor* dst, + bool indexed_src) { + lite::loongarch::math::CopyMatrixRowsFunctor row_shuffle; + dst->Resize(src.dims()); + dst->template mutable_data(); + row_shuffle(context, src, index_lod, dst, indexed_src); +} + +static inline int64_t CalculateSeqWidth(const DDim& dims) { + return dims.count(1, dims.size()); +} + +template +class GRUCompute : public KernelLite { + public: + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + bool origin_mode = param.origin_mode; + bool is_reverse = param.is_reverse; + + auto* input = param.input; + auto* h0 = param.h0; + auto* weight = param.weight; + const T* weight_data = weight->template data(); + auto* bias = param.bias; + + auto* batch_gate = param.batch_gate; + auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev; + auto* batch_hidden = param.batch_hidden; + T* batch_gate_ptr = batch_gate->template mutable_data(); + T* batch_reset_hidden_prev_ptr = + batch_reset_hidden_prev->template mutable_data(); + T* batch_hidden_ptr = batch_hidden->template mutable_data(); + + auto* hidden = param.hidden; + hidden->template mutable_data(); + + const auto& hidden_dims = hidden->dims(); + + lite::loongarch::math::LoDTensor2BatchFunctor to_batch; + to_batch(context, *input, batch_gate, true, is_reverse); + + if (bias) { + lite::loongarch::math::RowwiseAdd add_bias; + add_bias(context, *batch_gate, *bias, batch_gate); + } + + int frame_size = hidden_dims[1]; + lite::loongarch::math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. 
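// Concretely (illustrative): LoDTensor2BatchFunctor sorts the sequences
// (typically by descending length) and records the resulting permutation in
// the batch LoD; `order` below is that permutation, and ReorderInitState
// gathers the rows of h0 so that row k of ordered_h0 holds the initial state
// of the sequence placed at batch position k.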
+ const std::vector& order(batch_gate->lod()[2]); + ReorderInitState(context, *h0, order, &ordered_h0, true); + gru_value.prev_out_value = ordered_h0.mutable_data(); + } else { + gru_value.prev_out_value = nullptr; + } + + const auto& batch_starts = batch_gate->lod()[0]; + size_t seq_len = batch_starts.size() - 1; + int64_t batch_gate_width = CalculateSeqWidth(batch_gate->dims()); + int64_t batch_reset_hidden_prev_width = + CalculateSeqWidth(batch_reset_hidden_prev->dims()); + int64_t batch_hidden_width = CalculateSeqWidth(batch_hidden->dims()); + auto active_node = + lite::loongarch::math::detail::GetActivationType(param.activation); + auto active_gate = + lite::loongarch::math::detail::GetActivationType(param.gate_activation); + + for (size_t n = 0; n < seq_len; n++) { + int64_t bstart = static_cast(batch_starts[n]); + int64_t bend = static_cast(batch_starts[n + 1]); + int64_t cur_batch_size = bend - bstart; + + gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width; + gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width; + gru_value.reset_output_value = batch_reset_hidden_prev_ptr + + bstart * batch_reset_hidden_prev_width; + + lite::loongarch::math::GRUUnitFunctor::compute( + context, + gru_value, + frame_size, + cur_batch_size, + active_node, + active_gate, + origin_mode); + + gru_value.prev_out_value = gru_value.output_value; + } + + lite::loongarch::math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(context, *batch_hidden, hidden); + } +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/gru_compute_test.cc b/lite/kernels/loongarch/gru_compute_test.cc new file mode 100644 index 00000000000..872834d7196 --- /dev/null +++ b/lite/kernels/loongarch/gru_compute_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
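// The expected values in this test follow directly from the zero-filled
// inputs, weights, and bias: the update and reset gate columns pass through
// sigmoid, and sigmoid(0) = 0.5, so the first 2 * frame_size = 10 entries of
// every batch_gate row are 0.5; the candidate column passes through tanh, and
// tanh(0) = 0, so the remaining 5 entries and all hidden outputs stay 0.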
+ +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/gru_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(gru_loongarch, retrive_op) { + auto gru = KernelRegistry::Global().Create("gru"); + ASSERT_FALSE(gru.empty()); + ASSERT_TRUE(gru.front()); +} + +TEST(gru_loongarch, init) { + GRUCompute gru; + ASSERT_EQ(gru.precision(), PRECISION(kFloat)); + ASSERT_EQ(gru.target(), TARGET(kLoongArch)); +} + +TEST(gru_loongarch, run_test) { + lite::Tensor input, h0, weight, bias; + lite::Tensor batch_gate, batch_reset_hidden_prev, batch_hidden, hidden; + constexpr int batch_size = 9; + std::vector input_shape{batch_size, 15}; + input.Resize(lite::DDim(input_shape)); + std::vector weight_shape{5, 15}; + weight.Resize(lite::DDim(weight_shape)); + std::vector h0_shape{3, 5}; + h0.Resize(lite::DDim(h0_shape)); + std::vector bias_shape{1, 15}; + bias.Resize(lite::DDim(bias_shape)); + std::vector batch_gate_shape{batch_size, 15}; + batch_gate.Resize(lite::DDim(batch_gate_shape)); + std::vector batch_reset_hidden_prev_shape{batch_size, 5}; + batch_reset_hidden_prev.Resize(lite::DDim(batch_reset_hidden_prev_shape)); + std::vector batch_hidden_shape{batch_size, 5}; + batch_hidden.Resize(lite::DDim(batch_hidden_shape)); + std::vector hidden_shape{batch_size, 5}; + hidden.Resize(lite::DDim(hidden_shape)); + + std::vector> lod{{0, 2, 6, 9}}; + input.set_lod(lod); + + auto input_data = input.mutable_data(); + auto weight_data = weight.mutable_data(); + auto h0_data = h0.mutable_data(); + auto bias_data = bias.mutable_data(); + + for (int64_t i = 0; i < input.dims().production(); i++) { + input_data[i] = static_cast(0); + } + for (int64_t i = 0; i < weight.dims().production(); i++) { + weight_data[i] = static_cast(0); + } + for (int64_t i = 0; i < h0.dims().production(); i++) { + h0_data[i] = static_cast(0); + } + for (int64_t i = 0; i < bias.dims().production(); i++) { + bias_data[i] = static_cast(0); + } + // ReluCompute relu; + GRUCompute gru; + operators::GRUParam param; + + param.input = &input; + param.h0 = &h0; + param.weight = &weight; + param.bias = &bias; + param.batch_gate = &batch_gate; + param.batch_reset_hidden_prev = &batch_reset_hidden_prev; + param.batch_hidden = &batch_hidden; + param.hidden = &hidden; + param.gate_activation = "sigmoid"; + param.activation = "tanh"; + param.is_reverse = false; + param.origin_mode = false; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gru.SetContext(std::move(ctx)); + gru.SetParam(param); + gru.Run(); + + auto batch_gate_data = batch_gate.mutable_data(); + auto batch_reset_hidden_prev_data = + batch_reset_hidden_prev.mutable_data(); + auto batch_hidden_data = batch_hidden.mutable_data(); + auto hidden_data = hidden.mutable_data(); + std::vector batch_gate_out{ + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0}; + std::vector batch_reset_hidden_prev_out{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector batch_hidden_out{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector hidden_out{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + LOG(INFO) << "output: "; + for (int i = 0; i < batch_gate.dims().production(); i++) { + LOG(INFO) << batch_gate_data[i]; + EXPECT_NEAR(batch_gate_data[i], batch_gate_out[i], 1e-3); + } + for (int i = 0; i < batch_reset_hidden_prev.dims().production(); i++) { + LOG(INFO) << batch_reset_hidden_prev_data[i]; + EXPECT_NEAR( + batch_reset_hidden_prev_data[i], batch_reset_hidden_prev_out[i], 1e-3); + } + for (int i = 0; i < batch_hidden.dims().production(); i++) { + LOG(INFO) << batch_hidden_data[i]; + EXPECT_NEAR(batch_hidden_data[i], batch_hidden_out[i], 1e-3); + } + for (int i = 0; i < hidden.dims().production(); i++) { + LOG(INFO) << hidden_data[i]; + EXPECT_NEAR(hidden_data[i], hidden_out[i], 1e-3); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(gru, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/gru_unit_compute.cc b/lite/kernels/loongarch/gru_unit_compute.cc new file mode 100644 index 00000000000..f6914948644 --- /dev/null +++ b/lite/kernels/loongarch/gru_unit_compute.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
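// Layout used by GRUUnitCompute below: the gate buffer is
// (batch_size, 3 * frame_size), packed as [update | reset | candidate].
// The first GEMM accumulates h_prev (batch_size x frame_size) times the
// first 2 * frame_size columns of Weight into the update/reset slices
// (hence ldc = 3 * frame_size); after the gate activations, the second GEMM
// accumulates (reset .* h_prev) times the trailing frame_size x frame_size
// state-weight block into the candidate slice.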
+ +#include "lite/kernels/loongarch/gru_unit_compute.h" +#include "lite/backends/loongarch/math/blas.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +using EigenMatrix = lite::fluid::EigenMatrix; + +template +void GRUUnitCompute::Run() { +#ifndef WIN32 + auto& param = this->Param(); + auto& context = ctx_->As(); + auto* input = param.input; + auto* hidden_prev = param.hidden_prev; + auto* weight = param.weight; + auto* bias = param.bias; + auto* gate = param.gate; + gate->template mutable_data(); + auto* reset_hidden_prev = param.reset_hidden_prev; + reset_hidden_prev->template mutable_data(); + auto* hidden = param.hidden; + hidden->template mutable_data(); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + auto x = EigenMatrix::From(*input); + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto r_h_p = EigenMatrix::From(*reset_hidden_prev); + auto h = EigenMatrix::From(*hidden); + const auto& place = lite::fluid::EigenDeviceType(); + + if (bias) { + auto b = EigenMatrix::From(*bias); + g.device(place) = x + + b.reshape(Eigen::array({{1, frame_size * 3}})) + .broadcast(Eigen::array({{batch_size, 1}})); + } else { + g.device(place) = x; + } + + // calculate unactivated gate outputs + const T* hidden_prev_data = hidden_prev->template data(); + const T* weight_data = weight->template data(); + T* gate_data = gate->template mutable_data(); + T* reset_hidden_prev_data = reset_hidden_prev->template mutable_data(); + auto blas = lite::loongarch::math::GetBlas(context); + blas.GEMM(false, + false, + batch_size, + 2 * frame_size, + frame_size, + 1, + hidden_prev_data, + frame_size, + weight_data, + frame_size * 2, + 1, + gate_data, + frame_size * 3); + + // calculate activited gate + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; + ActCompute(param.gate_activation, + place, + g.slice(u_offsets, extents), + g.slice(u_offsets, extents)); + auto u = g.slice(u_offsets, extents); // update gate + Eigen::array r_offsets{{0, frame_size}}; + ActCompute(param.gate_activation, + place, + g.slice(r_offsets, extents), + g.slice(r_offsets, extents)); + auto r = g.slice(r_offsets, extents); // reset gate + r_h_p.device(place) = r * h_p; // reset previous hidden state + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + reset_hidden_prev_data, + frame_size, + weight_data + frame_size * frame_size * 2, + frame_size, + 1, + gate_data + frame_size * 2, + frame_size * 3); + + Eigen::array c_offsets{{0, frame_size * 2}}; + ActCompute(param.activation, + place, + g.slice(c_offsets, extents), + g.slice(c_offsets, extents)); + auto c = g.slice(c_offsets, extents); // output candidate + + // calculate final output + if (param.origin_mode) { + h.device(place) = c + u * (h_p - c); // (1 - u) * c + u * h_p + } else { + h.device(place) = u * (c - h_p) + h_p; // u * c + (1 - u) * h_p + } +#else + LOG(FATAL) << "Error: this model is not supported on Windows Os yet, because " + "gru_unit kernel is not supported on Windows Paddle-Lite, " + "please update your Paddle-Lite version."; +#endif +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(gru_unit, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::GRUUnitCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("HiddenPrev", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + 
.BindInput("Weight", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Gate", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("ResetHiddenPrev", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/gru_unit_compute.h b/lite/kernels/loongarch/gru_unit_compute.h new file mode 100644 index 00000000000..1b5ddf81665 --- /dev/null +++ b/lite/kernels/loongarch/gru_unit_compute.h @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/activation_compute.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; + +template +class GRUUnitCompute : public KernelLite { + public: + using param_t = operators::GRUUnitParam; + + void Run() override; + + virtual ~GRUUnitCompute() = default; + + template + void ActCompute(const int act_type, const Device& d, X x, Y y) const { + switch (GRUActivationType(act_type)) { + case identity: + y.device(d) = x; + break; + case sigmoid: + SigmoidFunctor()(d, x, y); + break; + case tanh: + TanhFunctor()(d, x, y); + break; + case relu: + ReluFunctor()(d, x, y); + break; + default: + LOG(FATAL) << "Unsupported activation type, only supports identity, " + "sigmoid, tanh and relu."; + } + } +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/instance_norm_compute.cc b/lite/kernels/loongarch/instance_norm_compute.cc new file mode 100644 index 00000000000..e7a1dd86036 --- /dev/null +++ b/lite/kernels/loongarch/instance_norm_compute.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/loongarch/instance_norm_compute.h" +#include "lite/backends/loongarch/xxl.h" +#include +#include "lite/backends/loongarch/math/include/instance_norm.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void InstanceNormCompute::PrepareForRun() {} + +void InstanceNormCompute::Run() { + auto& param = this->Param(); + const float* in = param.x->data(); + const float* scale = + param.scale == nullptr ? nullptr : param.scale->data(); + const float* bias = + param.bias == nullptr ? nullptr : param.bias->data(); + float* out = param.out->mutable_data(); + float* saved_mean = param.saved_mean->mutable_data(); + float* saved_variance = param.saved_variance->mutable_data(); + float epsilon = param.epsilon; + + int n = param.x->dims()[0]; + int c = param.x->dims()[1]; + int height = param.x->dims()[2]; + int width = param.x->dims()[3]; + if (param.x->dims().size() == 5) { + width = param.x->dims()[3] * param.x->dims()[4]; + } + + lite::loongarch::math::instance_norm(in, + out, + n, + c, + height, + width, + epsilon, + scale, + bias, + saved_mean, + saved_variance); +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(instance_norm, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::InstanceNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/instance_norm_compute.h b/lite/kernels/loongarch/instance_norm_compute.h new file mode 100644 index 00000000000..eeccf935cf0 --- /dev/null +++ b/lite/kernels/loongarch/instance_norm_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +class InstanceNormCompute : public KernelLite { + public: + using param_t = operators::InstanceNormParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~InstanceNormCompute() = default; + + private: +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/interpolate_compute.cc b/lite/kernels/loongarch/interpolate_compute.cc new file mode 100644 index 00000000000..1e2772c412a --- /dev/null +++ b/lite/kernels/loongarch/interpolate_compute.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/interpolate_compute.h" +#include +#include +#include "lite/backends/loongarch/math/interpolate.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void BilinearInterpCompute::Run() { + auto& param = Param(); + // required input + lite::Tensor* X = param.X; + // optionla inputs + lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; + // output + lite::Tensor* Out = param.Out; + // optional attributes + float scale = param.scale; + int out_w = param.out_w; + int out_h = param.out_h; + int align_mode = param.align_mode; + // required attributes + bool align_corners = param.align_corners; + std::string interp_method = "Bilinear"; + lite::loongarch::math::interpolate(X, + OutSize, + SizeTensor, + Scale, + Out, + scale, + param.scale_v, + out_h, + out_w, + align_mode, + align_corners, + interp_method); +} + +void NearestInterpCompute::Run() { + auto& param = Param(); + // required input + lite::Tensor* X = param.X; + // optionla inputs + lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; + // output + lite::Tensor* Out = param.Out; + // optional attributes + float scale = param.scale; + int out_w = param.out_w; + int out_h = param.out_h; + int align_mode = param.align_mode; + // required attributes + bool align_corners = param.align_corners; + std::string interp_method = "Nearest"; + lite::loongarch::math::interpolate(X, + OutSize, + SizeTensor, + Scale, + Out, + scale, + param.scale_v, + out_h, + out_w, + align_mode, + align_corners, + interp_method); +} + +void NearestInterpComputeV2::Run() { + auto& param = Param(); + // required input + lite::Tensor* X = param.X; + // optionla inputs + lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; + // output + lite::Tensor* Out = param.Out; + // optional attributes + float scale = param.scale; + int out_w = param.out_w; + int out_h = param.out_h; + int align_mode = param.align_mode; + // required attributes + bool align_corners = param.align_corners; + std::string interp_method = "Nearest"; + lite::loongarch::math::interpolate_v2(X, + OutSize, + SizeTensor, + Scale, + Out, + scale, + param.scale_v, + out_h, + out_w, + align_mode, + align_corners, + interp_method); +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(bilinear_interp, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::BilinearInterpCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(nearest_interp, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::NearestInterpCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(bilinear_interp_v2, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::BilinearInterpCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(nearest_interp_v2, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::NearestInterpComputeV2, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/interpolate_compute.h b/lite/kernels/loongarch/interpolate_compute.h new file mode 100644 index 00000000000..4647fcd9618 --- /dev/null +++ b/lite/kernels/loongarch/interpolate_compute.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +class BilinearInterpCompute + : public KernelLite { + public: + void Run() override; + + virtual ~BilinearInterpCompute() = default; +}; + +class NearestInterpCompute + : public KernelLite { + public: + void Run() override; + + virtual ~NearestInterpCompute() = default; +}; + +class NearestInterpComputeV2 + : public KernelLite { + public: + void Run() override; + + virtual ~NearestInterpComputeV2() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/layer_norm_compute.cc b/lite/kernels/loongarch/layer_norm_compute.cc new file mode 100644 index 00000000000..2d83fb7a172 --- /dev/null +++ b/lite/kernels/loongarch/layer_norm_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/loongarch/layer_norm_compute.h"
+
+REGISTER_LITE_KERNEL(layer_norm,
+                     kLoongArch,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::loongarch::LayerNormCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .Finalize();
diff --git a/lite/kernels/loongarch/layer_norm_compute.h b/lite/kernels/loongarch/layer_norm_compute.h
new file mode 100644
index 00000000000..9ebaaea5e75
--- /dev/null
+++ b/lite/kernels/loongarch/layer_norm_compute.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +#include "lite/backends/loongarch/jit/helper.h" +#include "lite/backends/loongarch/jit/kernel_base.h" +#include "lite/backends/loongarch/jit/kernels.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/layer_norm_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class LayerNormCompute : public KernelLite { + public: + using param_t = operators::LayerNormParam; + + void Run() override { + auto ¶m = *param_.get_mutable(); + float epsilon = param.epsilon; + auto Scale = param.Scale; + auto Bias = param.Bias; + auto x = param.X; + + auto y = param.Y; + auto Mean = param.Mean; + auto Var = param.Variance; + auto begin_norm_axis = param.begin_norm_axis; + + auto x_dims = x->dims(); + + y->template mutable_data(); + Mean->template mutable_data(); + Var->template mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + lite::Tensor in; + in.ShareDataWith(*x); + in.Resize(matrix_shape); + lite::Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + CHECK_EQ(Mean->numel(), left); + CHECK_EQ(Var->numel(), left); + CHECK_EQ(Scale->numel(), right); + CHECK_EQ(Bias->numel(), right); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(in.mutable_data(), + out.mutable_data(), + Mean->template mutable_data(), + Var->template mutable_data(), + Scale->template data(), + Bias->template data(), + static_cast(left), + epsilon, + right); + } + + virtual ~LayerNormCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/layer_norm_compute_test.cc b/lite/kernels/loongarch/layer_norm_compute_test.cc new file mode 100644 index 00000000000..4ef3c160dd7 --- /dev/null +++ b/lite/kernels/loongarch/layer_norm_compute_test.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
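For reference, the JIT LayerNormTuple kernel that Run() dispatches to above performs a per-row normalization over the flattened [left, right] view. A plain scalar sketch of that computation (illustrative only; the variance convention and epsilon handling follow the usual layer-norm definition, not necessarily the JIT code byte-for-byte):

#include <cmath>

// Reference layer norm over a row-major [left, right] matrix: every row gets
// its own mean/variance, then a per-column scale and bias are applied.
static void layer_norm_ref(const float* x, float* y, float* mean, float* var,
                           const float* scale, const float* bias,
                           int left, int right, float epsilon) {
  for (int i = 0; i < left; ++i) {
    const float* row = x + i * right;
    float m = 0.f, v = 0.f;
    for (int j = 0; j < right; ++j) m += row[j];
    m /= right;
    for (int j = 0; j < right; ++j) v += (row[j] - m) * (row[j] - m);
    v /= right;
    mean[i] = m;
    var[i] = v;
    for (int j = 0; j < right; ++j) {
      const float norm = (row[j] - m) / std::sqrt(v + epsilon);
      y[i * right + j] = (scale ? scale[j] : 1.f) * norm + (bias ? bias[j] : 0.f);
    }
  }
}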
+ +#include + +#include +#include +#include + +#include "lite/backends/loongarch/jit/helper.h" +#include "lite/backends/loongarch/jit/kernel_base.h" +#include "lite/backends/loongarch/jit/kernels.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/layer_norm_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +std::vector ref(lite::Tensor* x, + lite::Tensor* Scale, + lite::Tensor* Bias, + lite::Tensor* y, + lite::Tensor* Mean, + lite::Tensor* Var, + int begin_norm_axis, + float epsilon) { + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + x->Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(x->mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + + std::vector ref_data; + auto result = out.mutable_data(); + for (int i = 0; i < y->dims().production(); ++i) { + ref_data.emplace_back(result[i]); + } + return ref_data; +} + +// layer_norm +TEST(layer_norm_loongarch, retrive_op) { + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); + ASSERT_FALSE(layer_norm.empty()); + ASSERT_TRUE(layer_norm.front()); +} + +TEST(layer_norm_loongarch, init) { + lite::kernels::loongarch::LayerNormCompute layer_norm; + ASSERT_EQ(layer_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(layer_norm.target(), TARGET(kLoongArch)); +} + +TEST(layer_norm_loongarch, run_test) { + lite::Tensor x; + lite::Tensor Scale; + lite::Tensor Bias; + + lite::Tensor out; + lite::Tensor Mean; + lite::Tensor Var; + + std::vector x_shape({1, 2, 3, 1}); + x.Resize(lite::DDim(x_shape)); + std::vector out_shape({1, 2, 3, 1}); + out.Resize(lite::DDim(out_shape)); + + int begin_norm_axis = 0; + float epsilon = 1e-5; + int pre = 1; + int post = 1; + for (int i = 0; i < begin_norm_axis; ++i) { + pre *= x_shape[i]; + } + for (size_t i = begin_norm_axis; i < x_shape.size(); ++i) { + post *= x_shape[i]; + } + std::vector scale_shape({post}); + Scale.Resize(scale_shape); + std::vector bias_shape({post}); + Bias.Resize(bias_shape); + + auto x_data = x.mutable_data(); + auto scale_data = Scale.mutable_data(); + auto bias_data = Bias.mutable_data(); + auto out_data = out.mutable_data(); + auto mean_data = Mean.mutable_data(); + auto var_data = Var.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < Scale.dims().production(); ++i) { + scale_data[i] = 1.5; + } + for (int64_t i = 0; i < Bias.dims().production(); ++i) { + bias_data[i] = 0.25; + } + + LayerNormCompute layer_norm; + operators::LayerNormParam param; + + param.X = &x; + param.Y = &out; + param.Scale = &Scale; + param.Bias = &Bias; + param.Mean = &Mean; + param.Variance = &Var; + param.begin_norm_axis = begin_norm_axis; + param.epsilon = epsilon; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + layer_norm.SetContext(std::move(ctx)); + layer_norm.SetParam(param); + layer_norm.Run(); + + std::vector ref_data = + ref(&x, &Scale, &Bias, &out, &Mean, &Var, begin_norm_axis, epsilon); + for (int j = 0; j < out.dims().production(); ++j) { + 
EXPECT_NEAR(out_data[j], ref_data[j], 1e-5); + } + LOG(INFO) << *mean_data; + LOG(INFO) << *var_data; +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(layer_norm, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/lookup_table_compute.cc b/lite/kernels/loongarch/lookup_table_compute.cc new file mode 100644 index 00000000000..8c4265ccff0 --- /dev/null +++ b/lite/kernels/loongarch/lookup_table_compute.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/lookup_table_compute.h" + +using LookupTableFloatInt64 = + paddle::lite::kernels::loongarch::LookupTableCompute; +using LookupTableFloatInt32 = + paddle::lite::kernels::loongarch::LookupTableCompute; + +REGISTER_LITE_KERNEL( + lookup_table, kLoongArch, kFloat, kNCHW, LookupTableFloatInt64, def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + lookup_table_v2, kLoongArch, kFloat, kNCHW, LookupTableFloatInt64, def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindPaddleOpVersion("lookup_table_v2", 1) + .Finalize(); + +REGISTER_LITE_KERNEL( + lookup_table, kLoongArch, kFloat, kNCHW, LookupTableFloatInt32, float_int32) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + lookup_table_v2, kLoongArch, kFloat, kNCHW, LookupTableFloatInt32, float_int32) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindPaddleOpVersion("lookup_table_v2", 1) + .Finalize(); diff --git a/lite/kernels/loongarch/lookup_table_compute.h b/lite/kernels/loongarch/lookup_table_compute.h new file mode 100644 index 00000000000..10766678275 --- /dev/null +++ b/lite/kernels/loongarch/lookup_table_compute.h @@ -0,0 +1,65 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class LookupTableCompute : public KernelLite { + public: + using param_t = operators::LookupTableParam; + + void Run() override { + auto ¶m = *param_.get_mutable(); + auto *ids_t = param.Ids; + auto *output_t = param.Out; + int64_t padding_idx = param.padding_idx; + const T_IDS *ids = ids_t->template data(); + int64_t ids_numel = ids_t->dims().production(); + + auto *table_t = param.W; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + const T_W *table = table_t->template data(); + T_W *output = output_t->template mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(T_W)); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != -1 && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T_W)); + } else { + CHECK_LT(ids[i], row_number) << "i = " << i; + CHECK_GE(ids[i], 0) << "i = " << i; + memcpy(output + i * row_width, + table + ids[i] * row_width, + row_width * sizeof(T_W)); + } + } + } + + virtual ~LookupTableCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/lookup_table_compute_test.cc b/lite/kernels/loongarch/lookup_table_compute_test.cc new file mode 100644 index 00000000000..fe2cc1425f9 --- /dev/null +++ b/lite/kernels/loongarch/lookup_table_compute_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
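To make the gather loop in LookupTableCompute::Run above concrete, here is a toy standalone version of the same indexing (the values and the padding_idx choice are purely illustrative; in the kernel a padding_idx of -1 disables padding):

#include <cstdint>
#include <cstring>
#include <vector>

int main() {
  const int64_t width = 3;                          // embedding width (W is 4 x 3)
  std::vector<float> table = {0, 1, 2, 10, 11, 12, 20, 21, 22, 30, 31, 32};
  std::vector<int64_t> ids = {2, 0, 3, 1};
  const int64_t padding_idx = 1;                    // hypothetical padding id
  std::vector<float> out(ids.size() * width, 0.f);  // output starts zeroed
  for (size_t i = 0; i < ids.size(); ++i) {
    if (ids[i] == padding_idx) continue;            // padded rows stay zero
    std::memcpy(out.data() + i * width, table.data() + ids[i] * width,
                width * sizeof(float));
  }
  // out rows: {20,21,22}, {0,1,2}, {30,31,32}, {0,0,0}
  return 0;
}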
+ +#include "lite/kernels/loongarch/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(lookup_table_loongarch, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + int vocab_size = 40; + int emb_size = 50; + int ids_h = 30; + int ids_w = 20; + + auto w_dim = DDim({vocab_size, emb_size}); + auto ids_dim = DDim({ids_h, ids_w}); + auto out_dim = DDim({ids_h, ids_w, emb_size}); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto* w_data = w.mutable_data(); + auto* ids_data = ids.mutable_data(); + auto* out_data = out.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % vocab_size; + } + int out_num = out_dim.production(); + for (int i = 0; i < out_num; i++) { + out_ref_data[i] = + static_cast((i % (vocab_size * emb_size)) + 1) / (w_num + 1); + } + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + param.padding_idx = padding_idx; + lookup_table.SetParam(param); + lookup_table.Run(); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/match_matrix_tensor_compute.cc b/lite/kernels/loongarch/match_matrix_tensor_compute.cc new file mode 100644 index 00000000000..43d4ca9ce04 --- /dev/null +++ b/lite/kernels/loongarch/match_matrix_tensor_compute.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/loongarch/match_matrix_tensor_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +void MatchMatrixTensorCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->template data(); + auto* bottom_r_data = y->template data(); + auto* t_data = w->template data(); + auto* out_data = out->template mutable_data(); + auto* bottom_l_trans_data = tmp->template mutable_data(); + memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T)); + memset(bottom_l_trans_data, 0.0, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + auto blas = lite::loongarch::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x->dims()[0], + dim_t * dim_in, + dim_in, + 1.0f, + bottom_l_data, + dim_in, + t_data, + dim_t * dim_in, + 0.0f, + bottom_l_trans_data, + dim_t * dim_in); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + auto blas = lite::loongarch::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasTrans, + len_l, + len_r, + dim_in, + 1.0f, + l_t_data, + dim_t * dim_in, + r_data, + dim_in, + 0.0f, + top_data, + len_r); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + match_matrix_tensor, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::MatchMatrixTensorCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git 
a/lite/kernels/loongarch/match_matrix_tensor_compute.h b/lite/kernels/loongarch/match_matrix_tensor_compute.h new file mode 100644 index 00000000000..88ee906b5a1 --- /dev/null +++ b/lite/kernels/loongarch/match_matrix_tensor_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class MatchMatrixTensorCompute + : public KernelLite { + public: + using param_t = operators::MatchMatrixTensorParam; + + void Run() override; + + virtual ~MatchMatrixTensorCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/match_matrix_tensor_compute_test.cc b/lite/kernels/loongarch/match_matrix_tensor_compute_test.cc new file mode 100644 index 00000000000..ffee485b484 --- /dev/null +++ b/lite/kernels/loongarch/match_matrix_tensor_compute_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
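In short, for every sequence pair b and channel t, MatchMatrixTensorCompute::Run above produces the block out[b][t][i][j] = sum over p, q of X_b[i][p] * W_t[p][q] * Y_b[j][q], realized as one large GEMM (X * W, cached in tmp) followed by one [len_l, dim_in] x [dim_in, len_r] GEMM per (b, t). A naive scalar reference for a single block, under those shape assumptions:

// x_b: [len_l, dim_in], w_t: [dim_in, dim_in], y_b: [len_r, dim_in],
// out_bt: [len_l, len_r]; all row-major.
static void match_matrix_block_ref(const float* x_b, const float* w_t,
                                   const float* y_b, float* out_bt,
                                   int len_l, int len_r, int dim_in) {
  for (int i = 0; i < len_l; ++i) {
    for (int j = 0; j < len_r; ++j) {
      float acc = 0.f;
      for (int p = 0; p < dim_in; ++p) {
        for (int q = 0; q < dim_in; ++q) {
          acc += x_b[i * dim_in + p] * w_t[p * dim_in + q] * y_b[j * dim_in + q];
        }
      }
      out_bt[i * len_r + j] = acc;
    }
  }
}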
+ +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/match_matrix_tensor_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(match_matrix_tensor_loongarch, retrive_op) { + auto kernel = KernelRegistry::Global().Create("match_matrix_tensor"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(match_matrix_tensor_loongarch, init) { + MatchMatrixTensorCompute mmtc; + ASSERT_EQ(mmtc.precision(), PRECISION(kFloat)); + ASSERT_EQ(mmtc.target(), TARGET(kLoongArch)); +} + +TEST(match_matrix_tensor_loongarch, run_test) { + int ix = 5, iy = 4, h = 2, dim_t = 2; + lite::Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* y_data = y.mutable_data(); + for (int64_t i = 0; i < y.numel(); i++) { + y_data[i] = static_cast(i); + } + auto* w_data = w.mutable_data(); + for (int64_t i = 0; i < w.numel(); i++) { + w_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + MatchMatrixTensorCompute mmtc; + mmtc.SetContext(std::move(ctx)); + + operators::MatchMatrixTensorParam param; + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + + mmtc.SetParam(param); + mmtc.Run(); + + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + // LOG(INFO) << out_data[i]; + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(match_matrix_tensor, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/matmul_compute.cc b/lite/kernels/loongarch/matmul_compute.cc new file mode 100644 index 00000000000..2ae07d1d68b --- /dev/null +++ b/lite/kernels/loongarch/matmul_compute.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/loongarch/matmul_compute.h" + +REGISTER_LITE_KERNEL(matmul, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::MatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/matmul_compute.h b/lite/kernels/loongarch/matmul_compute.h new file mode 100644 index 00000000000..ec29e852861 --- /dev/null +++ b/lite/kernels/loongarch/matmul_compute.h @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +/** + * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the + * original x_dim is returned. + */ +static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) { + if (x_dim.size() > 1) { + return x_dim; + } + return lite::DDim({1, x_dim[0]}); +} + +/** + * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the + * original y_dim is returned. + */ +static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) { + if (y_dim.size() > 1) { + return y_dim; + } + return lite::DDim({y_dim[0], 1}); +} + +template +class MatMulCompute : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void Run() override { + auto &context = ctx_->As(); + auto ¶m = *param_.get_mutable(); + + auto *x = param.X; + auto *y = param.Y; + auto *out = param.Out; + out->template mutable_data(); + + auto blas = lite::loongarch::math::GetBlas(context); + auto mat_dim_a = lite::loongarch::math::CreateMatrixDescriptor( + RowMatrixFromVector(x->dims()), 0, param.transpose_X); + auto mat_dim_b = lite::loongarch::math::CreateMatrixDescriptor( + ColumnMatrixFromVector(y->dims()), 0, param.transpose_Y); + auto scale = static_cast(param.alpha); + blas.MatMul(*x, mat_dim_a, *y, mat_dim_b, scale, out, T(0)); + } + + virtual ~MatMulCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/matmul_compute_test.cc b/lite/kernels/loongarch/matmul_compute_test.cc new file mode 100644 index 00000000000..2a19d500216 --- /dev/null +++ b/lite/kernels/loongarch/matmul_compute_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/matmul_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(matmul_loongarch, retrive_op) { + auto matmul = KernelRegistry::Global().Create("matmul"); + ASSERT_FALSE(matmul.empty()); + ASSERT_TRUE(matmul.front()); +} + +TEST(matmul_loongarch, init) { + lite::kernels::loongarch::MatMulCompute matmul; + ASSERT_EQ(matmul.precision(), PRECISION(kFloat)); + ASSERT_EQ(matmul.target(), TARGET(kLoongArch)); +} + +TEST(matmul_loongarch, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector y_shape{2, 4}; + y.Resize(lite::DDim(y_shape)); + std::vector out_shape{batch_size, 3, 4}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto y_data = y.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < y.dims().production(); i++) { + y_data[i] = static_cast(i); + } + // MatMulCompute matmul; + MatMulCompute matmul; + operators::MatMulParam param; + + param.X = &x; + param.Y = &y; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + matmul.SetContext(std::move(ctx)); + matmul.SetParam(param); + matmul.Run(); + + std::vector ref_result = {4, 5, 6, 7, 12, 17, 22, 27, 20, 29, 38, 47}; + + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_result[i], 1e-3); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(matmul, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/matmul_v2_compute.cc b/lite/kernels/loongarch/matmul_v2_compute.cc new file mode 100644 index 00000000000..62026d84baa --- /dev/null +++ b/lite/kernels/loongarch/matmul_v2_compute.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
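The expected values in the matmul test above can be reproduced with a naive reference product, treating x's single batch as a [3, 2] matrix and y as [2, 4]:

#include <cstdio>

int main() {
  const int M = 3, K = 2, N = 4;
  float x[M * K], y[K * N], out[M * N] = {0};
  for (int i = 0; i < M * K; ++i) x[i] = static_cast<float>(i);
  for (int i = 0; i < K * N; ++i) y[i] = static_cast<float>(i);
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n)
      for (int k = 0; k < K; ++k) out[m * N + n] += x[m * K + k] * y[k * N + n];
  // Prints: 4 5 6 7 12 17 22 27 20 29 38 47 (matching ref_result).
  for (int i = 0; i < M * N; ++i) std::printf("%g ", out[i]);
  return 0;
}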
+ +#include "lite/kernels/loongarch/matmul_v2_compute.h" + +REGISTER_LITE_KERNEL(matmul_v2, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::MatMulV2Compute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/matmul_v2_compute.h b/lite/kernels/loongarch/matmul_v2_compute.h new file mode 100644 index 00000000000..811b2534821 --- /dev/null +++ b/lite/kernels/loongarch/matmul_v2_compute.h @@ -0,0 +1,305 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +#define INIT_PARAM \ + auto& ctx = this->ctx_->template As(); \ + auto& param = *param_.get_mutable(); \ + auto x_dims = param.X->dims(); \ + auto y_dims = param.Y->dims(); \ + int m, n, k; \ + int lda, ldb, ldc; \ + bool x_transpose = param.transpose_X; \ + bool y_transpose = param.transpose_Y; \ + if ((x_dims.size() >= 2 && y_dims.size() >= 2) && \ + (x_dims.size() != 2 || y_dims.size() != 2)) { \ + if (!x_transpose) { \ + m = x_dims[x_dims.size() - 2]; \ + k = x_dims[x_dims.size() - 1]; \ + lda = k; \ + } else { \ + m = x_dims[x_dims.size() - 1]; \ + k = x_dims[x_dims.size() - 2]; \ + lda = m; \ + } \ + if (!y_transpose) { \ + n = y_dims[y_dims.size() - 1]; \ + ldb = n; \ + CHECK_EQ(k, y_dims[y_dims.size() - 2]) \ + << "k must be equal y_dims[y_dims.size() - 2]"; \ + } else { \ + n = y_dims[y_dims.size() - 2]; \ + ldb = k; \ + CHECK_EQ(k, y_dims[y_dims.size() - 1]) \ + << "k must be equal y_dims[y_dims.size() - 1]"; \ + } \ + ldc = n; \ + if (x_dims.size() > 2 && y_dims.size() > 2) { \ + auto sum_x = x_dims.count(0, x_dims.size() - 2); \ + auto sum_y = y_dims.count(0, y_dims.size() - 2); \ + CHECK_EQ(sum_x, sum_y) \ + << "sum_x(x_dims[0]+..x_dims[size()-2]) must be equal with " \ + "sum_y(y_dims[0]+..y_dims[size()-2])"; \ + } \ + } else if ((x_dims.size() == 2 && y_dims.size() == 2) || \ + (x_dims.size() == 2 && y_dims.size() == 1)) { \ + if (!x_transpose) { \ + m = x_dims[0]; \ + k = x_dims[1]; \ + lda = k; \ + } else { \ + m = x_dims[1]; \ + k = x_dims[0]; \ + lda = m; \ + } \ + if (!y_transpose) { \ + if (y_dims.size() > 1) { \ + n = y_dims[1]; \ + } else { \ + n = 1; \ + } \ + ldb = n; \ + CHECK_EQ(k, y_dims[0]) << "k must be equal y_dims[0]"; \ + } else { \ + if (y_dims.size() > 1) { \ + n = y_dims[0]; \ + CHECK_EQ(k, y_dims[1]) << "k must be equal y_dims[1]"; \ + } else { \ + n = 1; \ + CHECK_EQ(k, y_dims[0]) << "k must be equal y_dims[0]"; \ + } \ + ldb = k; \ + } \ + ldc = n; \ + } else if (x_dims.size() >= 2 && y_dims.size() == 1) { \ + n = 1; \ + k = y_dims[0]; \ + if (!x_transpose) { \ + m = 
x_dims.count(0, x_dims.size() - 1); \ + CHECK_EQ(k, x_dims[x_dims.size() - 1]) \ + << "k must be equal x_dims[x_dims.size() - 1]"; \ + } else { \ + m = x_dims.count(1, x_dims.size() - 1); \ + CHECK_EQ(k, x_dims[0]) << "k must be equal x_dims[0]"; \ + } \ + lda = k; \ + ldb = n; \ + ldc = n; \ + } else if (y_dims.size() >= 2 && x_dims.size() == 1) { \ + m = 1; \ + k = x_dims[0]; \ + if (!y_transpose) { \ + n = y_dims.count(1, y_dims.size()); \ + CHECK_EQ(k, y_dims[0]) << "k must be equal y_dims[0]"; \ + } else { \ + n = y_dims.count(0, y_dims.size() - 1); \ + CHECK_EQ(k, y_dims[y_dims.size() - 1]) \ + << "k must be equal y_dims[y_dims.size() - 1]"; \ + } \ + lda = k; \ + ldb = n; \ + ldc = n; \ + } else if (x_dims.size() == 1 && y_dims.size() == 1) { \ + m = 1; \ + n = 1; \ + k = x_dims[0]; \ + if (x_transpose == true && y_transpose == true) { \ + m = x_dims[0]; \ + k = 1; \ + n = y_dims[0]; \ + } else if (x_transpose == false && y_transpose == false) { \ + CHECK_EQ(x_dims[0], y_dims[0]) << "x_dims[0] must be equal y_dims[0]"; \ + } else { \ + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" \ + << y_dims << ")" \ + << ", when x_transpose is " << x_transpose \ + << " and y_transpose is " << y_transpose; \ + } \ + lda = k; \ + ldb = n; \ + ldc = n; \ + } else { \ + LOG(FATAL) << "This x_dims: " << x_dims << " and y_dims: " << y_dims \ + << " doesn't support!"; \ + } + +template +class MatMulV2Compute : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void Run() override { + INIT_PARAM; + const auto* x_data = param.X->template data(); + const auto* y_data = param.Y->template data(); + auto* o_data = param.Out->template mutable_data(); + auto o_dims = param.Out->dims(); + auto alpha = param.alpha; + + auto blas = lite::loongarch::math::GetBlas(ctx); + + if ((x_dims.size() >= 2 && y_dims.size() >= 2) && + (x_dims.size() != 2 || y_dims.size() != 2)) { + // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [B, M, K], y: [K, N], out: [B, M, N] + // or + // x: [M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [M, K], y: [B, K, N], out: [B, M, N] + int x_inner = x_dims[x_dims.size() - 2] * x_dims[x_dims.size() - 1]; + int y_inner = y_dims[y_dims.size() - 2] * y_dims[y_dims.size() - 1]; + int out_inner = o_dims[o_dims.size() - 2] * o_dims[o_dims.size() - 1]; + + if (x_dims.size() > 2 && y_dims.size() > 2) { + for (size_t i = 0; i < x_dims.count(0, x_dims.size() - 2); ++i) { + blas.GEMM(x_transpose, + y_transpose, + m, + n, + k, + alpha, + x_data + i * x_inner, + lda, + y_data + i * y_inner, + ldb, + 0.f, + o_data + i * out_inner, + ldc); + } + } else if (x_dims.size() > 2 && y_dims.size() == 2) { + for (size_t i = 0; i < x_dims.count(0, x_dims.size() - 2); ++i) { + blas.GEMM(x_transpose, + y_transpose, + m, + n, + k, + alpha, + x_data + i * x_inner, + lda, + y_data, + ldb, + 0.f, + o_data + i * out_inner, + ldc); + } + } else if (x_dims.size() == 2 && y_dims.size() > 2) { + for (size_t i = 0; i < y_dims.count(0, y_dims.size() - 2); ++i) { + blas.GEMM(x_transpose, + y_transpose, + m, + n, + k, + alpha, + x_data, + lda, + y_data + i * y_inner, + ldb, + 0.f, + o_data + i * out_inner, + ldc); + } + } + } else if (x_dims.size() == 2 && y_dims.size() == 2) { + // x: [M, K], y: [K, N], out: [M, N] + blas.GEMM(x_transpose, + y_transpose, + m, + n, + k, + alpha, + x_data, + lda, + y_data, + ldb, + 0.f, + o_data, + ldc); + } else if (x_dims.size() >= 2 && y_dims.size() == 1) { + // x: [B, M, K], y: [K], out: [B, M] + blas.GEMM(x_transpose, + 
false, + m, + n, + k, + alpha, + x_data, + lda, + y_data, + ldb, + 0.f, + o_data, + ldc); + } else if (y_dims.size() >= 2 && x_dims.size() == 1) { + // y: [B, K, N], x: [K], out: [B, N] + blas.GEMM(false, + y_transpose, + m, + n, + k, + alpha, + x_data, + lda, + y_data, + ldb, + 0.f, + o_data, + ldc); + } else if (x_dims.size() == 1 && y_dims.size() == 1) { + // x: [K], y: [K], out: [1] + if (x_transpose == false && y_transpose == false) { + o_data[0] = 0.; + for (size_t i = 0; i < x_dims[0]; ++i) { + o_data[0] += x_data[i] * y_data[i] * alpha; + } + } else if (x_transpose == true && y_transpose == true) { + blas.GEMM(false, + false, + m, + n, + k, + alpha, + x_data, + lda, + y_data, + ldb, + 0.f, + o_data, + ldc); + } else { + LOG(FATAL) << "not supported x_dims.(" << x_dims << ") and y_dims(" + << y_dims << ")" + << ", and x_transpose: " << x_transpose + << ", y_transpose: " << y_transpose; + } + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" + << y_dims << ")"; + } + } + + virtual ~MatMulV2Compute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/mul_compute.cc b/lite/kernels/loongarch/mul_compute.cc new file mode 100644 index 00000000000..adea640fb55 --- /dev/null +++ b/lite/kernels/loongarch/mul_compute.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/mul_compute.h" + +REGISTER_LITE_KERNEL(mul, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::MulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/mul_compute.h b/lite/kernels/loongarch/mul_compute.h new file mode 100644 index 00000000000..ee01a6c4ee0 --- /dev/null +++ b/lite/kernels/loongarch/mul_compute.h @@ -0,0 +1,87 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
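As a worked example of the shape handling in the INIT_PARAM macro of matmul_v2_compute.h above: for X with dims [2, 3, 4] and Y with dims [2, 4, 5], neither transposed, the batched branch yields m = 3, k = 4, n = 5, lda = 4, ldb = 5 and ldc = 5, and Run() then issues one GEMM per batch with offsets x_inner = 12, y_inner = 20 and out_inner = 15.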
+#pragma once + +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +// using Tensor = framework::Tensor; +inline lite::Tensor ReshapeToMatrix(const lite::Tensor& src, int num_col_dims) { + int rank = src.dims().size(); + if (rank == 2) { + return src; + } + lite::Tensor res; + res.ShareDataWith(src); + res.Resize(src.dims().Flatten2D(num_col_dims)); + return res; +} + +template +class MulCompute : public KernelLite { + public: + using param_t = operators::MulParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + // CHECK(context.loongarch_device_context()); + + auto* z = param.output; + + auto* x = param.x; + auto* y = param.y; + + Tensor x_matrix, y_matrix; + + if (x->dims().size() > 2) { + x_matrix = ReshapeToMatrix(*x, param.x_num_col_dims); + } else { + x_matrix = *x; + } + + if (y->dims().size() > 2) { + y_matrix = ReshapeToMatrix(*y, param.y_num_col_dims); + + } else { + y_matrix = *y; + } + + z->template mutable_data(); + auto z_dim = z->dims(); + if (z_dim.size() != 2) { + z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + + auto blas = lite::loongarch::math::GetBlas(context); + + blas.MatMul(x_matrix, y_matrix, z); + if (z_dim.size() != 2) { + z->Resize(z_dim); + } + } + + virtual ~MulCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/mul_compute_test.cc b/lite/kernels/loongarch/mul_compute_test.cc new file mode 100644 index 00000000000..bbb8e2c3576 --- /dev/null +++ b/lite/kernels/loongarch/mul_compute_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
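For example, with x of dims [2, 3, 4, 5] and x_num_col_dims = 2, ReshapeToMatrix above yields a [6, 20] view that shares the original storage, so the mul kernel runs a single 2-D GEMM without copying. A small usage sketch of the helper defined above (shapes only):

lite::Tensor x;
x.Resize({2, 3, 4, 5});
x.mutable_data<float>();
lite::Tensor mat = ReshapeToMatrix(x, /*num_col_dims=*/2);
// mat.dims() == {6, 20}; mat shares x's buffer rather than copying it.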
+ +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/mul_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(mul_loongarch, retrive_op) { + auto mul = KernelRegistry::Global().Create("mul"); + ASSERT_FALSE(mul.empty()); + ASSERT_TRUE(mul.front()); +} + +TEST(mul_loongarch, init) { + lite::kernels::loongarch::MulCompute mul; + ASSERT_EQ(mul.precision(), PRECISION(kFloat)); + ASSERT_EQ(mul.target(), TARGET(kLoongArch)); +} + +TEST(mul_loongarch, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector y_shape{3, 4}; + y.Resize(lite::DDim(y_shape)); + std::vector out_shape{batch_size, 4}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto y_data = y.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < y.dims().production(); i++) { + y_data[i] = static_cast(i); + } + // MulCompute mul; + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetContext(std::move(ctx)); + mul.SetParam(param); + mul.Run(); + + std::vector ref_result = {20, 23, 26, 29}; + + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_result[i], 1e-3); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(mul, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/pool_compute.cc b/lite/kernels/loongarch/pool_compute.cc new file mode 100644 index 00000000000..4d282cd0030 --- /dev/null +++ b/lite/kernels/loongarch/pool_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/pool_compute.h" + +REGISTER_LITE_KERNEL(pool2d, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::PoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/pool_compute.h b/lite/kernels/loongarch/pool_compute.h new file mode 100644 index 00000000000..360b271ea8b --- /dev/null +++ b/lite/kernels/loongarch/pool_compute.h @@ -0,0 +1,87 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/backends/loongarch/math/pooling.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class PoolCompute : public KernelLite { + public: + using param_t = operators::PoolParam; + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + if (param.global_pooling) { + for (size_t i = 0; i < param.ksize.size(); ++i) { + param.ksize[i] = static_cast(param.x->dims()[i + 2]); + } + } + switch (param.ksize.size()) { + case 2: { + if (param.pooling_type == "max") { + paddle::lite::loongarch::math::Pool2dFunctor< + lite::TargetType::kLoongArch, + paddle::lite::loongarch::math::MaxPool, + T> + pool2d_forward; + paddle::lite::loongarch::math::MaxPool pool_process; + pool2d_forward(context, + param.x, + param.ksize, + param.strides, + *param.paddings, + pool_process, + true, + false, + param.output); + } else if (param.pooling_type == "avg") { + paddle::lite::loongarch::math::Pool2dFunctor< + lite::TargetType::kLoongArch, + paddle::lite::loongarch::math::AvgPool, + T> + pool2d_forward; + paddle::lite::loongarch::math::AvgPool pool_process; + pool2d_forward(context, + param.x, + param.ksize, + param.strides, + *param.paddings, + pool_process, + param.exclusive, + param.adaptive, + param.output); + } + } break; + case 3: { + } break; + } + } + virtual ~PoolCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/pool_compute_test.cc b/lite/kernels/loongarch/pool_compute_test.cc new file mode 100644 index 00000000000..2883150cc82 --- /dev/null +++ b/lite/kernels/loongarch/pool_compute_test.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
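A scalar reference for the 2x2, stride-2, no-padding max-pool case exercised by the test that follows (illustrative; the real Pool2dFunctor additionally handles padding, exclusive/adaptive average pooling and the 3-D case):

#include <algorithm>

// Max-pools one H x W plane with a k x k window and stride s, no padding.
static void max_pool_plane_ref(const float* in, float* out, int H, int W,
                               int k, int s) {
  const int out_h = (H - k) / s + 1;
  const int out_w = (W - k) / s + 1;
  for (int oh = 0; oh < out_h; ++oh) {
    for (int ow = 0; ow < out_w; ++ow) {
      float m = in[oh * s * W + ow * s];
      for (int i = 0; i < k; ++i)
        for (int j = 0; j < k; ++j)
          m = std::max(m, in[(oh * s + i) * W + (ow * s + j)]);
      out[oh * out_w + ow] = m;
    }
  }
}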
+ +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/pool_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(pool_loongarch, retrive_op) { + auto pool2d = KernelRegistry::Global().Create("pool2d"); + ASSERT_FALSE(pool2d.empty()); + ASSERT_TRUE(pool2d.front()); +} + +TEST(pool2d_loongarch, init) { + PoolCompute pool2d; + ASSERT_EQ(pool2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(pool2d.target(), TARGET(kLoongArch)); +} + +TEST(pool2d_loongarch, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 4, 4}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + + PoolCompute pool2d; + operators::PoolParam param; + + param.x = &x; + param.output = &out; + param.strides = {2, 2}; + std::vector paddings = {0, 0, 0, 0}; + param.paddings = std::make_shared>(paddings); + param.ksize = {2, 2}; + param.pooling_type = "max"; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + pool2d.SetContext(std::move(ctx)); + pool2d.SetParam(param); + pool2d.Run(); + + LOG(INFO) << "output: "; + float ref_result[12] = { + 5., 7., 13., 15., 21., 23., 29., 31., 37., 39., 45., 47.}; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + EXPECT_NEAR(out_data[i], ref_result[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(pool2d, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/pow_compute.cc b/lite/kernels/loongarch/pow_compute.cc new file mode 100644 index 00000000000..b6341e03d76 --- /dev/null +++ b/lite/kernels/loongarch/pow_compute.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/loongarch/pow_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void PowCompute::Run() { + LOG(INFO) << "PowCompute"; + auto& param = Param(); + const float* x_data = param.X->data(); + float* output_data = param.Out->mutable_data(); + DDim x_dims = param.X->dims(); + float scale = 1.0; + float shift = 0.0; + float power = param.factor; + + lite::loongarch::math::power( + x_data, output_data, x_dims.production(), scale, shift, power); +} +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + pow, kLoongArch, kFloat, kNCHW, paddle::lite::kernels::loongarch::PowCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/pow_compute.h b/lite/kernels/loongarch/pow_compute.h new file mode 100644 index 00000000000..23f8111d5cc --- /dev/null +++ b/lite/kernels/loongarch/pow_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/loongarch/math/power.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +class PowCompute : public KernelLite { + public: + using param_t = operators::PowParam; + void Run() override; + virtual ~PowCompute() = default; + +#ifdef LITE_WITH_PROFILE +#define PROFILE_INFO \ + template <> \ + void PowCompute::SetProfileRuntimeKernelInfo( \ + paddle::lite::profile::OpCharacter* ch) { \ + ch->kernel_func_name = kernel_func_name_; \ + } + +#define KERNEL_FUNC_NAME(kernel_func_name) kernel_func_name_ = kernel_func_name; +#else +#define PROFILE_INFO(dtype1, dtype2) +#define KERNEL_FUNC_NAME(kernel_func_name) +#endif +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/reduce_compute.cc b/lite/kernels/loongarch/reduce_compute.cc new file mode 100644 index 00000000000..44532ea4df1 --- /dev/null +++ b/lite/kernels/loongarch/reduce_compute.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
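PowCompute above delegates to lite::loongarch::math::power with scale = 1 and shift = 0. That helper is not shown in this patch, so the sketch below assumes the conventional element-wise semantics out[i] = pow(scale * x[i] + shift, factor):

#include <cmath>
#include <cstdio>

// Assumed element-wise semantics of the power helper used by PowCompute:
// out[i] = pow(scale * x[i] + shift, factor). PowCompute passes scale = 1
// and shift = 0, so it reduces to out[i] = pow(x[i], factor).
static void power_ref(const float* x, float* out, int n, float scale,
                      float shift, float factor) {
  for (int i = 0; i < n; ++i) out[i] = std::pow(scale * x[i] + shift, factor);
}

int main() {
  const float x[4] = {1.f, 2.f, 3.f, 4.f};
  float y[4];
  power_ref(x, y, 4, 1.f, 0.f, 2.f);  // factor = 2 -> squares each element
  for (float v : y) std::printf("%.0f ", v);  // 1 4 9 16
  std::printf("\n");
  return 0;
}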
+ +#include "lite/kernels/loongarch/reduce_compute.h" + +namespace loongarch = paddle::lite::kernels::loongarch; + +using ReduceMeanFloat32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_mean, kLoongArch, kFloat, kNCHW, ReduceMeanFloat32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +#ifdef LITE_BUILD_EXTRA +using ReduceSumFloat32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_sum, kLoongArch, kFloat, kNCHW, ReduceSumFloat32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +using ReduceSumInt32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_sum, kLoongArch, kFloat, kNCHW, ReduceSumInt32, int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +using ReduceSumInt64 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_sum, kLoongArch, kFloat, kNCHW, ReduceSumInt64, int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +using ReduceProdFloat32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_prod, kLoongArch, kFloat, kNCHW, ReduceProdFloat32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +using ReduceProdInt32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_prod, kLoongArch, kFloat, kNCHW, ReduceProdInt32, int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +using ReduceProdInt64 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_prod, kLoongArch, kFloat, kNCHW, ReduceProdInt64, int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +using ReduceMaxFloat32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_max, kLoongArch, kFloat, kNCHW, ReduceMaxFloat32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +using ReduceMaxInt32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_max, kLoongArch, kFloat, kNCHW, ReduceMaxInt32, int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +using ReduceMaxInt64 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_max, kLoongArch, kFloat, kNCHW, ReduceMaxInt64, int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +using ReduceMinFloat32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_min, kLoongArch, kFloat, kNCHW, ReduceMinFloat32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +using ReduceMinInt32 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_min, 
kLoongArch, kFloat, kNCHW, ReduceMinInt32, int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +using ReduceMinInt64 = loongarch::ReduceCompute; +REGISTER_LITE_KERNEL(reduce_min, kLoongArch, kFloat, kNCHW, ReduceMinInt64, int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); +#endif // LITE_BUILD_EXTRA diff --git a/lite/kernels/loongarch/reduce_compute.h b/lite/kernels/loongarch/reduce_compute.h new file mode 100644 index 00000000000..59fdf79f91c --- /dev/null +++ b/lite/kernels/loongarch/reduce_compute.h @@ -0,0 +1,120 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/reduce_op_function.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +struct SumFunctor { + template + void operator()(X* x, Y* y, const Dim& dim) { + y->device(lite::fluid::EigenDeviceType()) = x->sum(dim); + } +}; + +struct ProdFunctor { + template + void operator()(X* x, Y* y, const Dim& dim) { + y->device(lite::fluid::EigenDeviceType()) = x->prod(dim); + } +}; + +struct MeanFunctor { + template + void operator()(X* x, Y* y, const Dim& dim) { + y->device(lite::fluid::EigenDeviceType()) = x->mean(dim); + } +}; + +struct MaxFunctor { + template + void operator()(X* x, Y* y, const Dim& dim) { + y->device(lite::fluid::EigenDeviceType()) = x->maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(X* x, Y* y, const Dim& dim) { + y->device(lite::fluid::EigenDeviceType()) = x->minimum(dim); + } +}; + +#define HANDLE_DIM(NDIM, RDIM, FUNCTOR) \ + if (ndim == NDIM && rdim == RDIM) { \ + paddle::lite::kernels::loongarch:: \ + ReduceFunctor( \ + *x, out, dims, keep_dim); \ + } + +template +class ReduceCompute : public KernelLite { + public: + using param_t = operators::ReduceParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* x = param.X; + auto* out = param.Out; + out->template mutable_data(); + auto x_dims = x->dims(); + + const auto& dims = param.dim; + bool keep_dim = param.keep_dim; + bool reduce_all = param.reduce_all; + if (reduce_all || dims.empty() || x_dims.size() == 1 || + x_dims.size() == dims.size()) { + // Flatten and reduce 1-D tensor + auto x_e = lite::fluid::EigenVector::Flatten(*x); + auto out_e = lite::fluid::EigenScalar::From(out); + auto reduce_dim = Eigen::array({{0}}); + Functor functor; + functor(&x_e, &out_e, reduce_dim); + } else { + int ndim = x_dims.size(); + int rdim = dims.size(); + HANDLE_DIM(6, 5, Functor); + HANDLE_DIM(6, 4, Functor); + HANDLE_DIM(6, 3, Functor); + HANDLE_DIM(6, 2, Functor); + 
HANDLE_DIM(6, 1, Functor); + HANDLE_DIM(5, 4, Functor); + HANDLE_DIM(5, 3, Functor); + HANDLE_DIM(5, 2, Functor); + HANDLE_DIM(5, 1, Functor); + HANDLE_DIM(4, 3, Functor); + HANDLE_DIM(4, 2, Functor); + HANDLE_DIM(4, 1, Functor); + HANDLE_DIM(3, 2, Functor); + HANDLE_DIM(3, 1, Functor); + HANDLE_DIM(2, 1, Functor); + } + } + + virtual ~ReduceCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/reduce_op_function.h b/lite/kernels/loongarch/reduce_op_function.h new file mode 100644 index 00000000000..5dd4f2fbe66 --- /dev/null +++ b/lite/kernels/loongarch/reduce_op_function.h @@ -0,0 +1,86 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +using EigenTensor = lite::fluid::EigenTensor; +template +using EigenScalar = lite::fluid::EigenScalar; +template +using EigenVector = lite::fluid::EigenVector; + +template +// const lite::Context& context, +void ReduceFunctor(const lite::Tensor& input, + lite::Tensor* output, + const std::vector& dims, + bool keep_dim) { + auto x = EigenTensor::From(input); + + auto reduce_dim = Eigen::array(); + auto x_rank = static_cast(x.dimensions().size()); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dim[i] = x_rank + dims[i]; + } else { + reduce_dim[i] = dims[i]; + } + } + + Functor functor; + if (D == 1) { + auto out = EigenScalar::From(output); + functor(&x, &out, reduce_dim); + } else { + std::vector out_dims; + if (keep_dim) { + // Construct the squeezed dims. + const int kDelFlag = -2; + out_dims = output->dims().Vectorize(); + for (size_t i = 0; i < dims.size(); ++i) { + out_dims[reduce_dim[i]] = kDelFlag; + } + out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag), + out_dims.end()); + } + auto out = EigenTensor::From( + *output, keep_dim ? DDim(out_dims) : output->dims()); + functor(&x, &out, reduce_dim); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/rnn_compute.cc b/lite/kernels/loongarch/rnn_compute.cc new file mode 100644 index 00000000000..345a8fb0e9e --- /dev/null +++ b/lite/kernels/loongarch/rnn_compute.cc @@ -0,0 +1,776 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/loongarch/math/rnn.h" +#include +#include +#include +#include +#include "lite/backends/host/math/split.h" +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/concat_and_split.h" +#include "lite/kernels/loongarch/rnn_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +#define RUN_RNN_LAYER(x, y, z, w) \ + RunRnnLayer(&ctx, \ + input_temp_holder, \ + parameter_lists[x], \ + init_h_unbind, \ + init_c_unbind, \ + sequence_length, \ + &last_h_unbind, \ + &last_c_unbind, \ + y, \ + x, \ + &gate_value, \ + z, \ + w, \ + mode) + +static void reset_parameter_vector( + const std::vector& raw_params_vec, + const int& num_layers, + const int& gate_num, + const bool& is_bidirec, + std::vector>* params_vec) { + // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers + // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to + // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers + const int& direction_num = is_bidirec ? 2 : 1; + const int& layer_weight_size = 4 * direction_num; + const int& all_weight_size = num_layers * layer_weight_size; + const int& bias_start_idx = all_weight_size / 2; + for (int i = 0; i < num_layers; i++) { + std::vector tensor_list; + tensor_list.reserve(layer_weight_size); + for (int j = 0; j < layer_weight_size; j++) { + Tensor tensor_holder; + tensor_list.emplace_back(tensor_holder); + } + for (int j = 0; j < layer_weight_size; j++) { + int k = j % 4; + const int& section = j / 4; + int tensor_idx = i * 2 * direction_num + section * 2 + k % 2; + if (k >= 2) { + tensor_idx += bias_start_idx; + } + tensor_list[j].ShareDataWith(*raw_params_vec[tensor_idx]); + } + params_vec->emplace_back(tensor_list); + } +} + +static void SwapPoniter(Tensor** a, Tensor** b) { + Tensor* c = *a; + *a = *b; + *b = c; +} + +/****************************************************** +input: + ctx:context, + input:(3D)time_step, batch, input_size, + weight:(2D)hidden_size, input_size, + bias_ih, + bias_hh, + mode:LSTM, GRU +output: + cache_input:(3D)time_step, batch, hidden_size +*******************************************************/ +static void preprocess(LoongArchContext* ctx, + const Tensor* input, + const Tensor& weight, + const Tensor& bias_ih, + const Tensor& bias_hh, + std::string mode, + Tensor* cache_input) { + const int& hidden_size = weight.dims()[0]; + int time_step = input->dims()[0]; + int batch = input->dims()[1]; + + std::vector cache_input_dim = {time_step, batch, hidden_size}; + DDim gate_dim; + gate_dim.ConstructFrom(cache_input_dim); + cache_input->Resize(gate_dim); + + auto* i_data = input->data(); + auto* w_data = weight.data(); + auto* o_data = cache_input->mutable_data(); + auto input_dims = input->dims(); + auto weight_input_dims = weight.dims(); + int m = input_dims[0] * input_dims[1]; + int k = input_dims[2]; + int n = weight_input_dims[0]; + + lite::loongarch::math::Blas matmul(*ctx); + matmul.GEMM( + false, true, m, n, k, 1.f, i_data, k, w_data, k, 0.f, o_data, n); + lite::loongarch::math::fill_bias_fc(o_data, bias_ih.data(), m, n); + + if ("GRU" == mode) { + Tensor bias_tmp_hh; + bias_tmp_hh.Resize(bias_hh.dims()); + auto bias_ptr = bias_tmp_hh.mutable_data(); + auto bias_src = bias_hh.data(); + int bias_offt = bias_hh.numel() / 3 * 2; + std::memcpy(bias_ptr, bias_src, bias_offt * sizeof(float)); + std::memset( 
+ bias_ptr + bias_offt, 0, (bias_hh.numel() - bias_offt) * sizeof(float)); + lite::loongarch::math::fill_bias_fc(o_data, bias_tmp_hh.data(), m, n); + } else { + lite::loongarch::math::fill_bias_fc(o_data, bias_hh.data(), m, n); + } +} + +/****************************************************** +input: + ctx:context, + init_h:(2D), + init_c:(2D), + mask_tensor:(1D)input->dims()[1], + mode:LSTM, GRU +output: + output:(2D)output->dims()[1], output->dims()[2], + last_h:(2D), + last_c:(2D) +*******************************************************/ +static void postprocess(LoongArchContext* ctx, + Tensor* output, + const Tensor* init_h, + const Tensor* init_c, + Tensor* last_h, + Tensor* last_c, + const Tensor& mask_tensor, + std::string mode) { + Tensor mask_broadcast_1; + mask_broadcast_1.Resize(mask_tensor.dims()); + auto mask_ptr_1 = mask_broadcast_1.mutable_data(); + auto mask_ptr = mask_tensor.data(); + auto out_ptr = output->mutable_data(); + auto cur_h_ptr = last_h->mutable_data(); + auto pre_h_ptr = init_h->data(); + int offset = 0; + + // out = out * mask_broadcast + // curr_h = out * mask_broadcast + pre_h * (1 - mask_broadcast); + for (int i = 0; i < output->dims()[0]; i++) { + mask_ptr_1[i] = 1 - mask_ptr[i]; + for (int j = 0; j < output->dims()[1]; j++) { + offset = i * output->dims()[1] + j; + out_ptr[offset] *= mask_ptr[i]; + cur_h_ptr[offset] = out_ptr[offset] + pre_h_ptr[offset] * mask_ptr_1[i]; + } + } + if ("LSTM" == mode) { + auto pre_c_ptr = init_c->data(); + auto cur_c_ptr = last_c->mutable_data(); + + // curr_c = curr_c * mask_broadcast + pre_c * (1 - mask_broadcast); + for (int i = 0; i < output->dims()[0]; i++) { + for (int j = 0; j < output->dims()[1]; j++) { + offset = i * output->dims()[1] + j; + cur_c_ptr[offset] = + cur_c_ptr[offset] * mask_ptr[i] + pre_c_ptr[offset] * mask_ptr_1[i]; + } + } + } +} + +static DDim get_stride(const DDim& ddim) { + DDim strides; + strides[ddim.size() - 1] = 1; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i + 1]; + } + return strides; +} + +template +static void TransposeNormal(const Tensor& in, + Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = get_stride(in.dims()); + auto out_stride = get_stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->mutable_data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + transpose_helper(0, out->numel()); +} + +/****************************************************** +input: + sequence_length, + is_reverse +output: + mask_matrix, + min_seq_len +******************************************************/ +static void create_mask_matrix(const Tensor* sequence_length, + Tensor* mask_matrix, + const bool& is_reverse, + int* min_seq_len) { + // Tensor to vector + std::vector seq_len_vec; + seq_len_vec.resize(sequence_length->numel()); + std::memcpy(&seq_len_vec[0], + sequence_length->data(), + sequence_length->numel() * sizeof(int)); + + const int& table_width = mask_matrix->dims()[0]; + Tensor temp; + DDimLite dims( + std::vector{mask_matrix->dims()[1], mask_matrix->dims()[0]}); + temp.Resize(dims); + float* data_temp = 
temp.mutable_data(); + std::fill(data_temp, data_temp + mask_matrix->numel(), 1.f); + *min_seq_len = table_width; + for (unsigned int i = 0; i < seq_len_vec.size(); i++) { + // reset the mask matrix + *min_seq_len = std::min(seq_len_vec[i], *min_seq_len); + if (seq_len_vec[i] == table_width) { + continue; + } + if (is_reverse) { + std::fill(data_temp + i * table_width, + data_temp + (i + 1) * table_width - seq_len_vec[i], + 0.f); + } else { + std::fill(data_temp + i * table_width + seq_len_vec[i], + data_temp + (i + 1) * table_width, + 0.f); + } + } + mask_matrix->mutable_data(); + std::vector trans_vec; + trans_vec.emplace_back(1); + trans_vec.emplace_back(0); + TransposeNormal(temp, mask_matrix, trans_vec); +} + +static void lstm_cell(LoongArchContext* ctx, + Tensor* input, + Tensor* weight_hh, + Tensor* init_h, + Tensor* init_c, + Tensor* last_h, + Tensor* last_c, + Tensor* last_c_act, + Tensor* output, + const Tensor* bias_hh) { + auto h_dims = init_h->dims(); + auto weight_input_dims = weight_hh->dims(); + int m = h_dims[0]; + int k = h_dims[1]; + int n = weight_input_dims[0]; + auto i_data = input->data(); + auto w_data = weight_hh->data(); + auto h_data = init_h->data(); + + Tensor tmp_gate; + tmp_gate.Resize(input->dims()); + auto tmp_data = tmp_gate.mutable_data(); + + lite::loongarch::math::Blas matmul(*ctx); + matmul.GEMM( + false, true, m, n, k, 1.f, h_data, k, w_data, k, 0.f, tmp_data, n); + for (int i = 0; i < input->dims()[0] * input->dims()[1]; i++) { + tmp_data[i] += i_data[i]; + } + + Tensor tmp_init_c; + tmp_init_c.Resize(init_c->dims()); + auto tmp_init_c_data = tmp_init_c.mutable_data(); + for (int i = 0; i < tmp_init_c.dims()[0] * tmp_init_c.dims()[1]; i++) { + tmp_init_c_data[i] = init_c->data()[i]; + } + + lite::loongarch::math::LstmMetaValue lstm_value; + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + lite_api::ActivationType gate_act = lite_api::ActivationType::kSigmoid_v2; + lite_api::ActivationType cell_act = lite_api::ActivationType::kTanh_v2; + lite_api::ActivationType cand_act = lite_api::ActivationType::kTanh_v2; + + size_t frame_size = init_h->dims()[1]; + size_t batch_size = init_h->dims()[0]; + Tensor cell_pre_act; + if (last_c_act == nullptr) { + cell_pre_act.Resize(init_h->dims()); + cell_pre_act.mutable_data(); + last_c_act = &cell_pre_act; + } + + lstm_value.prev_state_value = tmp_init_c_data; + lstm_value.gate_value = tmp_data; + lstm_value.output_value = output->mutable_data(); + lstm_value.state_value = last_c->mutable_data(); + lstm_value.state_active_value = last_c_act->mutable_data(); + float cell_clip = 0.0; + lite::loongarch::math::RnnLstmUnitFunctor::compute(lstm_value, + frame_size, + batch_size, + cell_clip, + cand_act, + gate_act, + cell_act, + 1); +} + +static void gru_cell(LoongArchContext* ctx, + Tensor* input, + Tensor* weight_hh, + Tensor* init_h, + Tensor* init_c, + Tensor* last_h, + Tensor* last_c, + Tensor* last_c_act, + Tensor* output, + const Tensor* bias_hh, + Tensor* weight_hh_gru) { + auto h_dims = init_h->dims(); + auto weight_gru_dims = weight_hh_gru->dims(); + int m = h_dims[0]; + int k = h_dims[1]; + int n = weight_gru_dims[0]; + auto i_data = input->data(); + auto w_gru = weight_hh_gru->data(); + auto h_data = init_h->data(); + + Tensor tmp_gate; + tmp_gate.Resize(input->dims()); + auto tmp_data = tmp_gate.mutable_data(); + + lite::loongarch::math::Blas matmul(*ctx); + matmul.GEMM( + false, true, m, n, k, 1.f, h_data, k, w_gru, k, 0.f, tmp_data, n); + for (int i = 0; i < 
input->dims()[0] * input->dims()[1]; i++) { + tmp_data[i] += i_data[i]; + } + + size_t frame_size = init_h->dims()[1]; + size_t batch_size = init_h->dims()[0]; + + lite::loongarch::math::GRUMetaValue gru_value; + gru_value.gate_weight = weight_hh->data(); + gru_value.state_weight = + weight_hh->data() + 2 * frame_size * frame_size; + gru_value.reset_bias = bias_hh->data() + 2 * frame_size; + + gru_value.gate_value = tmp_data; + gru_value.reset_output_value = last_c->mutable_data(); + gru_value.output_value = output->mutable_data(); + gru_value.prev_out_value = init_h->data(); + + auto gate_act = lite_api::ActivationType::kSigmoid_v2; + auto cand_act = lite_api::ActivationType::kTanh_v2; + + lite::loongarch::math::RnnGruUnitFunctorV2::compute( + ctx, gru_value, frame_size, batch_size, cand_act, gate_act); +} + +static void RunRnnLayer(LoongArchContext* ctx, + const Tensor* input, + std::vector vec, + std::vector init_h, + std::vector init_c, + const Tensor* sequence_length, + std::vector* last_h_ptr, + std::vector* last_c_ptr, + Tensor* output, + int layer_idx, + Tensor* gate_value, + bool is_bidirect, + int offset, + std::string mode) { + bool is_reverse = false; + if (is_bidirect) { + layer_idx = 2 * layer_idx + offset; + if (offset > 0) { + is_reverse = true; + } + } + + const int& time_step = input->dims()[0]; + preprocess(ctx, + input, + vec[0 + offset * 4], + vec[2 + offset * 4], + vec[3 + offset * 4], + mode, + gate_value); + + std::vector input_tensors, output_tensors; + std::vector input_tensors_t, output_tensors_t; + std::vector stride1, stride2, stride3; + input_tensors.resize(gate_value->dims()[0]); + output_tensors.resize(output->dims()[0]); + + // unbind + for (int i = 0; i < gate_value->dims()[0]; i++) { + stride1.push_back(1); + int dim1 = gate_value->dims()[1]; + int dim2 = gate_value->dims()[2]; + DDimLite dims(std::vector{dim1, dim2}); + input_tensors[i].Resize(dims); + input_tensors_t.push_back(&input_tensors[i]); + } + for (int i = 0; i < output->dims()[0]; i++) { + stride2.push_back(1); + int dim1 = output->dims()[1]; + int dim2 = output->dims()[2]; + DDimLite dims(std::vector{dim1, dim2}); + output_tensors[i].Resize(dims); + output_tensors_t.push_back(&output_tensors[i]); + } + lite::host::math::split( + gate_value->data(), input_tensors_t, 0, stride1); + lite::host::math::split(output->data(), output_tensors_t, 0, stride2); + auto sd = output->mutable_data(); + + if (is_reverse) { + // don't need to reverse input_tensors_t becauese of unuseful + std::reverse(input_tensors.begin(), input_tensors.end()); + } + bool has_sequence_length = false; + if (sequence_length != nullptr) { + has_sequence_length = true; + } + // unbind + Tensor mask_matrix; + std::vector mask_vec; + std::vector mask_tensor_list; + int mask_min_length = time_step; + + /* + to be verifying! + */ + if (has_sequence_length) { + mask_matrix.Resize(DDimLite({time_step, input->dims()[1]})); + create_mask_matrix( + sequence_length, &mask_matrix, is_reverse, &mask_min_length); + for (int i = 0; i < time_step; i++) { + stride3.push_back(1); + DDimLite ddims(std::vector{input->dims()[1]}); + mask_vec[i].Resize(ddims); + mask_tensor_list.push_back(&mask_vec[i]); + } + lite::host::math::split( + mask_matrix.data(), mask_tensor_list, 0, stride3); + } + if (is_reverse) { + mask_min_length = mask_min_length - time_step + 1; + } + + bool has_use_last_h_holder = false; + const int& reverse_flag = is_reverse ? 
-1 : 1; + bool has_allocate_mem_c = false; + + // define the init_h holder for the swap + Tensor init_h_temp; + init_h_temp.Resize(init_h[layer_idx].dims()); + init_h_temp.CopyDataFrom(init_h[layer_idx]); + Tensor* init_h_holder = &init_h_temp; + Tensor* last_h_holder = nullptr; + if (0 < mask_min_length) { + last_h_holder = &(output_tensors[0]); + } else { + last_h_holder = &(*last_h_ptr)[layer_idx]; + has_use_last_h_holder = true; + } + + Tensor* init_c_holder = nullptr; + Tensor* init_c_temp_holder = nullptr; + Tensor init_c_temp; + Tensor* last_c_holder = nullptr; + Tensor last_c_temp; + + if ("LSTM" == mode) { + last_c_holder = &(*last_c_ptr)[layer_idx]; + init_c_temp_holder = &init_c[layer_idx]; + } else if ("GRU" == mode) { + // for reset output value + last_c_temp.Resize(init_h[layer_idx].dims()); + last_c_temp.mutable_data(); + last_c_holder = &last_c_temp; + } + + Tensor weight_hh_tmp; // for gru + std::vector weight_hh_tmp_ubind; + std::vector weight_hh_tmp_ubind_t; + std::vector stride_w; + if ("GRU" == mode) { + weight_hh_tmp.Resize(vec[1 + offset * 4].dims()); + weight_hh_tmp.mutable_data(); + weight_hh_tmp.CopyDataFrom(vec[1 + offset * 4]); + int size = weight_hh_tmp.numel() / 3; + std::memset(weight_hh_tmp.mutable_data() + size * 2, + 0, + size * sizeof(float)); + } + + for (int i = 0; i < time_step; i++) { + bool in_mask = (reverse_flag * i) >= mask_min_length; + if (i > 0) { + if (!has_allocate_mem_c) { + if (("LSTM" == mode) || ("GRU" == mode)) { + init_c_temp.Resize(init_h[layer_idx].dims()); + init_c_temp.mutable_data(); + init_c_holder = &init_c_temp; + } + has_allocate_mem_c = true; + } + SwapPoniter(&init_c_holder, &last_c_holder); + init_c_temp_holder = init_c_holder; + } + + if ("LSTM" == mode) { + lstm_cell(ctx, + &input_tensors[i], + &vec[1 + offset * 4], + init_h_holder, + init_c_temp_holder, + last_h_holder, + last_c_holder, + nullptr, + &output_tensors[i], + &vec[3 + offset * 4]); + } else if ("GRU" == mode) { + gru_cell(ctx, + &input_tensors[i], + &vec[1 + offset * 4], + init_h_holder, + init_c_temp_holder, + last_h_holder, + last_c_holder, + nullptr, + &output_tensors[i], + &vec[3 + offset * 4], + &weight_hh_tmp); + } + + /* + to be verifying! 
+ */ + if (in_mask) { + postprocess(ctx, + &output_tensors[i], + init_h_holder, + init_c_temp_holder, + last_h_holder, + last_c_holder, + mask_vec[i], + mode); + } + + // prepare next step + if (i + 1 < time_step) { + bool next_step_mask = (reverse_flag * (i + 1)) >= mask_min_length; + if (next_step_mask) { + if (!has_use_last_h_holder) { + init_h_holder = &(*last_h_ptr)[layer_idx]; + } + } else { + init_h_holder = &(output_tensors[i + 1]); + } + SwapPoniter(&init_h_holder, &last_h_holder); + } + } + + // unbind vector and source are not in the same address, need copy + // different from paddle + if (is_reverse) { + std::reverse(output_tensors.begin(), output_tensors.end()); + } + for (int i = 0; i < time_step; i++) { + int st = output_tensors[i].dims()[0] * output_tensors[i].dims()[1]; + for (int j = 0; j < st; j++) { + sd[i * st + j] = output_tensors[i].data()[j]; + } + } + + if (has_sequence_length) { + if (last_h_holder != &(*last_h_ptr)[layer_idx]) { + (*last_h_ptr)[layer_idx].CopyDataFrom(*last_h_holder); + } + } else { + (*last_h_ptr)[layer_idx].CopyDataFrom(output_tensors[time_step - 1]); + } + if ((0 == (time_step % 2)) && ("LSTM" == mode)) { + (*last_c_ptr)[layer_idx].CopyDataFrom(*last_c_holder); + } +} + +void RnnCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + param.Out->mutable_data(); + std::string mode = param.mode; + auto input = param.Input; + auto weight_list = param.WeightList; + auto pre_state = param.PreState; + auto state = param.State; + auto output = param.Out; + bool is_bidirec = param.is_bidirec; + int num_layers = param.num_layers; + const Tensor* sequence_length = param.SequenceLength; + int gate_num = 0; + + if ("LSTM" == mode) { + gate_num = 4; + } else if ("GRU" == mode) { + gate_num = 3; + } else { + LOG(FATAL) << "LoongArch RNN ERROR: unsupport mode except gru and lstm," + " present mode is " + << mode; + return; + } + + state[0]->mutable_data(); + if ("LSTM" == mode) { + state[1]->mutable_data(); + } + // reset the parameter to sorted order and allocate the memory + std::vector> parameter_lists; + parameter_lists.reserve(num_layers); + reset_parameter_vector( + weight_list, num_layers, gate_num, is_bidirec, ¶meter_lists); + Tensor* input_holder; + Tensor* output_holder = output; + Tensor temp, gate_value; + bool has_allocate_mem = false; + + std::vector init_h_unbind, init_c_unbind, last_h_unbind, + last_c_unbind; + std::vector init_h_unbind_t, init_c_unbind_t, last_h_unbind_t, + last_c_unbind_t; + init_h_unbind.resize(pre_state[0]->dims()[0]); + last_h_unbind.resize(state[0]->dims()[0]); + + if ("LSTM" == mode) { + init_c_unbind.resize(pre_state[1]->dims()[0]); + last_c_unbind.resize(state[1]->dims()[0]); + } + std::vector stride1, stride2; + + // unbind + for (int i = 0; i < pre_state[0]->dims()[0]; i++) { + stride1.push_back(1); + int dim1 = pre_state[0]->dims()[1]; + int dim2 = pre_state[0]->dims()[2]; + DDimLite dims(std::vector{dim1, dim2}); + init_h_unbind[i].Resize(dims); + last_h_unbind[i].Resize(dims); + init_h_unbind_t.push_back(&init_h_unbind[i]); + last_h_unbind_t.push_back(&last_h_unbind[i]); + } + lite::host::math::split( + pre_state[0]->data(), init_h_unbind_t, 0, stride1); + lite::host::math::split(state[0]->data(), last_h_unbind_t, 0, stride1); + + if ("LSTM" == mode) { + for (int i = 0; i < pre_state[1]->dims()[0]; i++) { + stride2.push_back(1); + int dim1 = pre_state[1]->dims()[1]; + int dim2 = pre_state[1]->dims()[2]; + DDimLite dims(std::vector{dim1, dim2}); + init_c_unbind[i].Resize(dims); + 
last_c_unbind[i].Resize(dims); + init_c_unbind_t.push_back(&init_c_unbind[i]); + last_c_unbind_t.push_back(&last_c_unbind[i]); + } + lite::host::math::split( + pre_state[1]->data(), init_c_unbind_t, 0, stride2); + lite::host::math::split( + state[1]->data(), last_c_unbind_t, 0, stride2); + } + + std::vector output_vec(2); + int time_step = input->dims()[0]; + int batch_size = input->dims()[1]; + int hidden_size = output->dims()[2]; + if (is_bidirec) { + for (int i = 0; i < 2; ++i) { + output_vec[i].Resize({time_step, batch_size, hidden_size / 2}); + output_vec[i].mutable_data(); + } + } + + for (int i = 0; i < num_layers; i++) { + if (i > 0) { + if (!has_allocate_mem) { + temp.Resize(output->dims()); + temp.mutable_data(); + input_holder = &temp; + has_allocate_mem = true; + } + SwapPoniter(&output_holder, &input_holder); + } + + const Tensor* input_temp_holder = input; + if (i > 0) { + input_temp_holder = input_holder; + } + + if (is_bidirec) { + RUN_RNN_LAYER(i, &output_vec[0], true, 0); + RUN_RNN_LAYER(i, &output_vec[1], true, 1); + lite::loongarch::math::ConcatFunctor concat_loongarch; + concat_loongarch(ctx, output_vec, 2, output_holder); + } else { + RUN_RNN_LAYER(i, output_holder, false, 0); + } + } + // output_holder != output + if (num_layers % 2 == 0) { + output->CopyDataFrom(*output_holder); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + rnn, kLoongArch, kFloat, kNCHW, paddle::lite::kernels::loongarch::RnnCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("WeightList", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("PreState", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("SequenceLength", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("DropoutState", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Reserve", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("State", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/rnn_compute.h b/lite/kernels/loongarch/rnn_compute.h new file mode 100644 index 00000000000..8618add7469 --- /dev/null +++ b/lite/kernels/loongarch/rnn_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
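The trickiest part of RnnCompute above is reset_parameter_vector, which regroups the flat WeightList (all input/hidden weights for every layer and direction first, then all biases) into per-layer, per-direction [Whi, Whh, Bhi, Bhh] quadruples. The index arithmetic can be exercised on its own; the standalone snippet below simply replays the kernel's formula for a two-layer bidirectional net:

#include <cstdio>

int main() {
  const int num_layers = 2;
  const bool is_bidirec = true;
  const int direction_num = is_bidirec ? 2 : 1;
  const int layer_weight_size = 4 * direction_num;  // entries per layer
  const int all_weight_size = num_layers * layer_weight_size;
  const int bias_start_idx = all_weight_size / 2;   // biases follow weights

  // Same index arithmetic as reset_parameter_vector: for layer i, slot j,
  // k selects {Whi, Whh, Bhi, Bhh} and `section` selects the direction.
  for (int i = 0; i < num_layers; ++i) {
    for (int j = 0; j < layer_weight_size; ++j) {
      const int k = j % 4;
      const int section = j / 4;
      int tensor_idx = i * 2 * direction_num + section * 2 + k % 2;
      if (k >= 2) tensor_idx += bias_start_idx;
      std::printf("layer %d, dir %d, slot %d -> WeightList[%d]\n",
                  i, section, k, tensor_idx);
    }
  }
  return 0;
}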
+ +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +class RnnCompute : public KernelLite { + public: + void Run() override; + + virtual ~RnnCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/scale_compute.cc b/lite/kernels/loongarch/scale_compute.cc new file mode 100644 index 00000000000..459bfcc08be --- /dev/null +++ b/lite/kernels/loongarch/scale_compute.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/scale_compute.h" + +REGISTER_LITE_KERNEL(scale, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ScaleCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(scale, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ScaleCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(scale, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::ScaleCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/loongarch/scale_compute.h b/lite/kernels/loongarch/scale_compute.h new file mode 100644 index 00000000000..88ffdb556dd --- /dev/null +++ b/lite/kernels/loongarch/scale_compute.h @@ -0,0 +1,60 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/relu_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +void scale_compute( + const T* x, T* out, int size, T scale, T bias, bool bias_before) { + if (bias_before) bias *= scale; + for (int i = 0; i < size; i++) { + out[i] = x[i] * scale + bias; + } +} + +template +class ScaleCompute : public KernelLite { + public: + using param_t = operators::ScaleParam; + + void Run() override { + auto& param = *param_.get_mutable(); + T scale = static_cast(param.scale); + T bias = static_cast(param.bias); + scale_compute(param.x->template data(), + param.output->template mutable_data(), + param.x->dims().production(), + scale, + bias, + !param.bias_after_scale); + } + + virtual ~ScaleCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/scale_compute_test.cc b/lite/kernels/loongarch/scale_compute_test.cc new file mode 100644 index 00000000000..f91610cce88 --- /dev/null +++ b/lite/kernels/loongarch/scale_compute_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
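scale_compute above folds both bias conventions into a single loop: when the bias is applied before scaling, out = scale * (x + bias) = x * scale + scale * bias, so pre-multiplying the bias by the scale is enough. A standalone check of that identity:

#include <cstdio>

// Both bias conventions expressed with the single fused loop used by
// scale_compute: pre-multiplying the bias reproduces scale * (x + bias).
static float fused(float x, float scale, float bias, bool bias_before) {
  if (bias_before) bias *= scale;
  return x * scale + bias;
}

int main() {
  const float x = 3.f, scale = 2.f, bias = 5.f;
  std::printf("bias after scale:  %.1f (expect %.1f)\n",
              fused(x, scale, bias, false), x * scale + bias);
  std::printf("bias before scale: %.1f (expect %.1f)\n",
              fused(x, scale, bias, true), (x + bias) * scale);
  return 0;
}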
+ +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/scale_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(scale_loongarch, retrive_op) { + auto scale = KernelRegistry::Global().Create("scale"); + ASSERT_FALSE(scale.empty()); + ASSERT_TRUE(scale.front()); +} + +TEST(scale_loongarch, init) { + ScaleCompute scale; + ASSERT_EQ(scale.precision(), PRECISION(kFloat)); + ASSERT_EQ(scale.target(), TARGET(kLoongArch)); +} + +TEST(scale_loongarch, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + // ScaleCompute scale; + ScaleCompute scale; + operators::ScaleParam param; + + param.x = &x; + param.scale = 0.5; + param.bias = 0; + param.output = &out; + + scale.SetParam(param); + scale.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(scale, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/search_aligned_mat_mul_compute.cc b/lite/kernels/loongarch/search_aligned_mat_mul_compute.cc new file mode 100644 index 00000000000..6e5638e822a --- /dev/null +++ b/lite/kernels/loongarch/search_aligned_mat_mul_compute.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/search_aligned_mat_mul_compute.h" + +REGISTER_LITE_KERNEL( + search_aligned_mat_mul, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/search_aligned_mat_mul_compute.h b/lite/kernels/loongarch/search_aligned_mat_mul_compute.h new file mode 100644 index 00000000000..ba07838676f --- /dev/null +++ b/lite/kernels/loongarch/search_aligned_mat_mul_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + lite::loongarch::math::MatDescriptor mat_dim_a; + mat_dim_a.height_ = M; + mat_dim_a.width_ = K; + mat_dim_a.stride_ = x_batch_size * x_inner_size; + mat_dim_a.batch_size_ = seq_num; + mat_dim_a.trans_ = x_transpose; + lite::loongarch::math::MatDescriptor mat_dim_b; + mat_dim_b.height_ = K; + mat_dim_b.width_ = N; + mat_dim_b.stride_ = y_batch_size * y_inner_size; + mat_dim_b.batch_size_ = seq_num; + mat_dim_b.trans_ = y_transpose; + auto blas = lite::loongarch::math::GetBlas(context); + blas.MatMul(*x, mat_dim_a, *y, mat_dim_b, static_cast(alpha), out, T(0)); + } + + virtual ~SearchAlignedMatMulCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/search_fc_compute.cc b/lite/kernels/loongarch/search_fc_compute.cc new file mode 100644 index 00000000000..e445dd09afd --- /dev/null +++ b/lite/kernels/loongarch/search_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
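SearchAlignedMatMulCompute above maps every LoD sequence onto one entry of a strided batched GEMM: M, N and K come from the aligned sequence length (x_batch_size / y_batch_size) and the inner sizes according to the transpose flags, and the stride between consecutive matrices is rows times columns. The Blas/MatDescriptor wrappers are taken as given; the naive standalone version below only illustrates that indexing:

#include <cstdio>
#include <vector>

// C[b] = alpha * op(A[b]) * op(B[b]) for each batch entry b, where A[b] is
// rows_a x cols_a stored row-major at A + b * rows_a * cols_a (likewise B).
static void batched_matmul(const float* A, bool trans_a, int rows_a, int cols_a,
                           const float* B, bool trans_b, int rows_b, int cols_b,
                           float alpha, int batch, float* C) {
  const int M = trans_a ? cols_a : rows_a;
  const int K = trans_a ? rows_a : cols_a;
  const int N = trans_b ? rows_b : cols_b;
  for (int b = 0; b < batch; ++b) {
    const float* a = A + b * rows_a * cols_a;  // stride_ = rows * cols
    const float* bm = B + b * rows_b * cols_b;
    float* c = C + b * M * N;
    for (int i = 0; i < M; ++i)
      for (int j = 0; j < N; ++j) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) {
          const float av = trans_a ? a[k * cols_a + i] : a[i * cols_a + k];
          const float bv = trans_b ? bm[j * cols_b + k] : bm[k * cols_b + j];
          acc += av * bv;
        }
        c[i * N + j] = alpha * acc;
      }
  }
}

int main() {
  // Two sequences of 2x3 ones multiplied by 3x2 twos -> two 2x2 results of 6s.
  std::vector<float> A(2 * 2 * 3, 1.f), B(2 * 3 * 2, 2.f), C(2 * 2 * 2);
  batched_matmul(A.data(), false, 2, 3, B.data(), false, 3, 2, 1.f, 2, C.data());
  std::printf("C[0][0][0] = %.1f\n", C[0]);  // 6.0
  return 0;
}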
+ +#include "lite/kernels/loongarch/search_fc_compute.h" + +REGISTER_LITE_KERNEL(search_fc, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/search_fc_compute.h b/lite/kernels/loongarch/search_fc_compute.h new file mode 100644 index 00000000000..624bd80b2b1 --- /dev/null +++ b/lite/kernels/loongarch/search_fc_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/loongarch/math/search_fc.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + param.Out->Resize({param.X->dims()[0], param.out_size}); + lite::loongarch::math::SearchFcFunctor search_fc; + search_fc(context, *param.X, *param.W, *param.b, param.Out, param.out_size); + } + virtual ~SearchFcCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/search_fc_compute_test.cc b/lite/kernels/loongarch/search_fc_compute_test.cc new file mode 100644 index 00000000000..9a4b8a8fb4d --- /dev/null +++ b/lite/kernels/loongarch/search_fc_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
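SearchFcCompute above resizes Out to {X.dims()[0], out_size} and computes Out = X * W^T + b, which is exactly what the fc_cpu_base reference in the test below spells out. For the test's data (X, W and b all filled with 0, 1, 2, ...) the single output row works out to [14, 39, 64]; a quick standalone check:

#include <cstdio>

int main() {
  // Same data the search_fc test generates: X is 1x4, W is 3x4, b has 3
  // entries, all filled with 0, 1, 2, ...  Out[j] = b[j] + sum_k X[k]*W[j][k].
  const float x[4] = {0, 1, 2, 3};
  float w[3][4], b[3];
  float v = 0.f;
  for (int j = 0; j < 3; ++j)
    for (int k = 0; k < 4; ++k) w[j][k] = v++;
  for (int j = 0; j < 3; ++j) b[j] = static_cast<float>(j);

  for (int j = 0; j < 3; ++j) {
    float acc = b[j];
    for (int k = 0; k < 4; ++k) acc += x[k] * w[j][k];
    std::printf("%.0f ", acc);  // 14 39 64
  }
  std::printf("\n");
  return 0;
}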
+ +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/search_fc_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc_loongarch, retrive_op) { + auto search_fc = KernelRegistry::Global().Create("search_fc"); + ASSERT_FALSE(search_fc.empty()); + ASSERT_TRUE(search_fc.front()); +} + +TEST(search_fc_loongarch, init) { + SearchFcCompute search_fc; + ASSERT_EQ(search_fc.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_fc.target(), TARGET(kLoongArch)); +} + +TEST(search_fc_loongarch, run_test) { + lite::Tensor x, w, b, out; + lite::Tensor out_ref; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + std::vector x_shape{1, 4}; + x.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + w.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 3}; + out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = x.mutable_data(); + auto w_data = w.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + + fc_cpu_base(&x, &w, &b, 3, &out_ref); + + SearchFcCompute fc; + operators::SearchFcParam param; + param.X = &x; + param.W = &w; + param.b = &b; + param.Out = &out; + param.out_size = 3; + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.Run(); + + VLOG(3) << "output vs ref"; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_fc, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/search_grnn_compute.cc b/lite/kernels/loongarch/search_grnn_compute.cc new file mode 100644 index 00000000000..7ba438a64cc --- /dev/null +++ b/lite/kernels/loongarch/search_grnn_compute.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/search_grnn_compute.h" +#include +#include +#include "lite/backends/loongarch/math/blas.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +void CallGemm(const lite::loongarch::math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +template +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + auto* _layout_input = param.layout_input; + auto* _input = input_blob; + + // usually total length + int dim0 = _input->dims()[0]; + // if it is id only sequence + int dim1 = 1; + // if its a embedding like sequence (dim1 would be embedding_size) + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + Tensor _width; + _width.Resize({batch}); + _idx_sorted_by_width->Resize({batch}); + int* width_data = _width.template mutable_data(); + int* idx_sorted_by_width_data = + _idx_sorted_by_width->template mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data[i] = i; + } + std::stable_sort(idx_sorted_by_width_data, + idx_sorted_by_width_data + batch, + [&_width](int a, int b) { + return _width.template data()[a] > + _width.template data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_data[k]] - last_width; + sub_col = k + 1; + + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->template mutable_data(); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + _input->template data() + + dim1 * offset[idx_sorted_by_width_data[j]] + dim1 * i, + dim1 * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::CopyBack(T* from, T* to, int step) { + auto& param = this->Param(); + auto* _input = 
param.x; + auto* _layout_input = param.layout_input; + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = + _idx_sorted_by_width->template data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->template mutable_data(); + + const auto* dense_e2h = wi->template data(); + const auto* dense_h2h = wh->template data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->template mutable_data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->template mutable_data(); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + auto blas = lite::loongarch::math::GetBlas(context); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2h, + 0.0f, + w_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hr, + 0.0f, + wr_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hz, + 0.0f, + wz_x_e); + + // precompute hidden0 + for (int i = 0; i < batch * _cap_h; i++) { + tilde[i] = std::tanh(w_x_e[i]); + z[i] = sigmoid(wz_x_e[i]); + hidden[i] = (1. 
- z[i]) * tilde[i]; + } + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2h, + 0.0f, + u_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hr, + 0.0f, + ur_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hz, + 0.0f, + uz_x_h + new_offset[i] * _cap_h); + + // compute the gate and hidden + for (size_t j = new_offset[i] * _cap_h; j < (new_offset[i] + w) * _cap_h; + j++) { + r[j] = sigmoid(wr_x_e[j] + ur_x_h[j]); + z[j] = sigmoid(wz_x_e[j] + uz_x_h[j]); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - _cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/search_grnn_compute.h b/lite/kernels/loongarch/search_grnn_compute.h new file mode 100644 index 00000000000..2d457d3b808 --- /dev/null +++ b/lite/kernels/loongarch/search_grnn_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
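// The recurrence in SearchGrnnCompute::Run() above is a GRU-style cell: the
// GEMMs produce the input projections (w_x_e, wr_x_e, wz_x_e) and the hidden
// projections (u_x_h, ur_x_h, uz_x_h), and the gates are then evaluated
// element by element. A minimal standalone sketch of that per-element update,
// using plain std:: facilities instead of the Lite tensor types (the names
// below are illustrative, not the kernel's actual interface):
#include <cmath>
#include <cstddef>

static inline float Sigmoid(float v) { return 1.f / (1.f + std::exp(-v)); }

// One time step for one sequence: every pointer addresses cap_h consecutive
// floats; h_prev is the previous hidden state, h_new receives the result.
void GruStep(const float* w_x_e, const float* wr_x_e, const float* wz_x_e,
             const float* u_x_h, const float* ur_x_h, const float* uz_x_h,
             const float* h_prev, float* h_new, std::size_t cap_h) {
  for (std::size_t j = 0; j < cap_h; ++j) {
    const float r = Sigmoid(wr_x_e[j] + ur_x_h[j]);          // reset gate
    const float z = Sigmoid(wz_x_e[j] + uz_x_h[j]);          // update gate
    const float tilde = std::tanh(w_x_e[j] + r * u_x_h[j]);  // candidate state
    h_new[j] = z * h_prev[j] + (1.f - z) * tilde;
  }
}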
+#pragma once + +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void Run() override; + + virtual ~SearchGrnnCompute() = default; + + private: + void PrepareLayout(const Tensor* input); + void CopyBack(T* from, T* to, int step); +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/search_grnn_compute_test.cc b/lite/kernels/loongarch/search_grnn_compute_test.cc new file mode 100644 index 00000000000..cd8e248a51c --- /dev/null +++ b/lite/kernels/loongarch/search_grnn_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/search_grnn_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(search_grnn_loongarch, retrive_op) { + auto kernel = KernelRegistry::Global().Create("search_grnn"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_grnn_loongarch, init) { + SearchGrnnCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kLoongArch)); +} + +TEST(search_grnn_loongarch, run_test) { + int num_input = 128; + int num_hidden = 128; + int num_batch = 3; + lite::Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + // out.Resize({num_batch, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* wi_data = wi.mutable_data(); + for (int64_t i = 0; i < wi.numel(); i++) { + wi_data[i] = static_cast(i); + } + auto* wh_data = wh.mutable_data(); + for (int64_t i = 0; i < wh.numel(); i++) { + wh_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SearchGrnnParam param; + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + + SearchGrnnCompute sgc; + sgc.SetContext(std::move(ctx)); + sgc.SetParam(param); + sgc.Run(); + + // std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + LOG(INFO) << out.numel(); + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_data[i], 
ref_results[i], 1e-3); + LOG(INFO) << out_data[i]; + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_grnn, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/search_group_padding_compute.cc b/lite/kernels/loongarch/search_group_padding_compute.cc new file mode 100644 index 00000000000..501cde0a8a7 --- /dev/null +++ b/lite/kernels/loongarch/search_group_padding_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/search_group_padding_compute.h" + +REGISTER_LITE_KERNEL( + search_group_padding, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SearchGroupPaddingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out_emb_padding", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out_new", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out_padding", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/search_group_padding_compute.h b/lite/kernels/loongarch/search_group_padding_compute.h new file mode 100644 index 00000000000..695a2b492cf --- /dev/null +++ b/lite/kernels/loongarch/search_group_padding_compute.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto* bottom0 = param.x; + auto* top0 = param.out_emb_padding; + auto* top1 = param.out_new; + auto* top2 = param.out_padding; + + int _pad_id = param.pad_id; + + int batch = bottom0->lod()[0].size() - 1; + int dim0 = bottom0->dims()[0]; + int dim1 = bottom0->dims()[1]; + + const auto offset = bottom0->lod()[0]; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + + // for padding data + lite::LoD top0_lod; + top0_lod.push_back(new_offset); + top0->set_lod(top0_lod); + top0->Resize({batch * max_seq, dim1}); + // for origin input id + // already set by ShareLoD in InferShape + lite::LoD top1_lod; + top1_lod.push_back(offset); + top1->set_lod(top1_lod); + top1->Resize({dim0, 1}); + memset(top1->template mutable_data(), + 0, + top1->dims()[0] * top1->dims()[1] * sizeof(T)); + // for padding input id + lite::LoD top2_lod; + top2_lod.push_back(new_offset); + top2->set_lod(top2_lod); + top2->Resize({batch * max_seq, 1}); + // copy data + const auto* bottom_data = bottom0->template data(); + auto* top_data = top0->template mutable_data(); + auto* top_padding_input_data = top2->template mutable_data(); + for (int i = 0; i < batch; i++) { + const int copy_step = offset[i + 1] - offset[i]; + const int start = i * max_seq; + memcpy(top_data + start * dim1, + bottom_data + offset[i] * dim1, + copy_step * dim1 * sizeof(T)); + memset(top_data + (start + copy_step) * dim1, + 0, + (max_seq - copy_step) * dim1 * sizeof(T)); + // for padding input id + memset(top_padding_input_data + start, 0, copy_step * sizeof(T)); + for (int j = start + copy_step; j < start + max_seq; j++) { + top_padding_input_data[j] = static_cast(_pad_id); + } + } + } + + virtual ~SearchGroupPaddingCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/search_group_padding_compute_test.cc b/lite/kernels/loongarch/search_group_padding_compute_test.cc new file mode 100644 index 00000000000..0c5489443e5 --- /dev/null +++ b/lite/kernels/loongarch/search_group_padding_compute_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
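// SearchGroupPaddingCompute above flattens a LoD batch into a dense
// [batch * max_seq, dim] buffer: each sequence is copied to the start of its
// slot, the rest of the slot is zero-filled, and the padded positions are
// marked with pad_id in Out_padding. A minimal standalone sketch of that
// layout with std::vector in place of lite::Tensor (Out_new, the zero
// placeholder for the original ids, is omitted here; illustrative only):
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <vector>

void PadGroups(const std::vector<float>& in,            // [offset.back(), dim]
               const std::vector<std::size_t>& offset,  // LoD level 0
               std::size_t dim, float pad_id,
               std::vector<float>* out,                 // [batch * max_seq, dim]
               std::vector<float>* pad_mask) {          // [batch * max_seq]
  const std::size_t batch = offset.size() - 1;
  std::size_t max_seq = 0;
  for (std::size_t i = 0; i < batch; ++i)
    max_seq = std::max(max_seq, offset[i + 1] - offset[i]);

  out->assign(batch * max_seq * dim, 0.f);
  pad_mask->assign(batch * max_seq, 0.f);
  for (std::size_t i = 0; i < batch; ++i) {
    const std::size_t len = offset[i + 1] - offset[i];
    std::memcpy(out->data() + i * max_seq * dim,
                in.data() + offset[i] * dim,
                len * dim * sizeof(float));
    // slots beyond the real length carry no data and are marked with pad_id
    std::fill(pad_mask->begin() + i * max_seq + len,
              pad_mask->begin() + (i + 1) * max_seq, pad_id);
  }
}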
+ +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/search_group_padding_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(search_group_padding_loongarch, retrieve_op) { + auto search_group_padding = + KernelRegistry::Global().Create("search_group_padding"); + ASSERT_FALSE(search_group_padding.empty()); + ASSERT_TRUE(search_group_padding.front()); +} + +TEST(search_group_padding_loongarch, init) { + SearchGroupPaddingCompute search_group_padding; + ASSERT_EQ(search_group_padding.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_group_padding.target(), TARGET(kLoongArch)); +} + +TEST(search_group_padding_loongarch, run_test) { + lite::Tensor x, out_emb_padding, out_new, out_padding; + x.Resize({2, 3}); + out_emb_padding.Resize({-1, 3}); + out_new.Resize({2, 1}); + out_padding.Resize({-1, 1}); + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + SearchGroupPaddingCompute sgp_kernel; + operators::SearchGroupPaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + sgp_kernel.SetContext(std::move(ctx)); + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + sgp_kernel.Run(); + + std::vector out_emb_padding_ref = {0, 1, 2}; + std::vector out_new_ref = {0, 0}; + std::vector out_padding_ref = {0}; + auto* out_emb_padding_data = out_emb_padding.mutable_data(); + auto* out_new_data = out_new.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(); + for (int i = 0; i < out_emb_padding.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_data[i], out_emb_padding_ref[i], 1e-5); + } + for (int i = 0; i < out_new.dims().production(); i++) { + EXPECT_NEAR(out_new_data[i], out_new_ref[i], 1e-5); + } + for (int i = 0; i < out_padding.dims().production(); i++) { + EXPECT_NEAR(out_padding_data[i], out_padding_ref[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/search_seq_depadding_compute.cc b/lite/kernels/loongarch/search_seq_depadding_compute.cc new file mode 100644 index 00000000000..3907cb9a782 --- /dev/null +++ b/lite/kernels/loongarch/search_seq_depadding_compute.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
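// For reference, the expected values in the search_group_padding test above
// follow directly from its inputs: x has LoD {0, 1}, so batch = 1 and
// max_seq = 1; Out_emb_padding is therefore a single row copied from x,
// {0, 1, 2}; Out_new is a [2, 1] zero tensor, {0, 0}; and Out_padding is the
// single entry {0}, because the only slot holds real data and nothing is
// padded.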
+ +#include "lite/kernels/loongarch/search_seq_depadding_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + const int pad_batch = pad->lod()[0].size() - 1; + const int src_batch = src->lod()[0].size() - 1; + if (pad_batch % src_batch != 0) { + LOG(FATAL) << "Mismatch batch size."; + } + + const auto& pad_offset = pad->lod()[0]; + const int pad_cap_e = pad->dims()[1]; + const auto& src_offset = src->lod()[0]; + const int src_cap_l = src->dims()[0]; + + LoD out_lod; + out_lod.push_back(src_offset); + out->set_lod(out_lod); + out->Resize({src_cap_l, pad_cap_e}); + + const auto* pad_data = pad->template data(); + auto* out_data = out->template mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + if (pad_i_l < src_i_l) { + LOG(FATAL) + << "the length of padding seq input is less than source seq input."; + } + memcpy(out_data + src_offset[i] * pad_cap_e, + pad_data + pad_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + search_seq_depadding, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SearchSeqDepaddingCompute, + def) + .BindInput("Pad", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Src", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/search_seq_depadding_compute.h b/lite/kernels/loongarch/search_seq_depadding_compute.h new file mode 100644 index 00000000000..dd3265b4f34 --- /dev/null +++ b/lite/kernels/loongarch/search_seq_depadding_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + + void Run() override; + + virtual ~SearchSeqDepaddingCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/search_seq_depadding_compute_test.cc b/lite/kernels/loongarch/search_seq_depadding_compute_test.cc new file mode 100644 index 00000000000..ff7f15d45c9 --- /dev/null +++ b/lite/kernels/loongarch/search_seq_depadding_compute_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/search_seq_depadding_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(search_seq_depadding_loongarch, retrive_op) { + auto kernel = KernelRegistry::Global().Create("search_seq_depadding"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_seq_depadding_loongarch, init) { + SearchSeqDepaddingCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kLoongArch)); +} + +TEST(search_seq_depadding_loongarch, run_test) { + lite::Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + auto* pad_data = pad.mutable_data(); + for (int64_t i = 0; i < pad.dims().production(); i++) { + pad_data[i] = static_cast(i); + } + SearchSeqDepaddingCompute ssdc; + operators::SearchSeqDepaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + ssdc.SetContext(std::move(ctx)); + + param.pad = &pad; + param.src = &src; + param.out = &out; + + ssdc.SetParam(param); + ssdc.Run(); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_seq_depadding, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/search_seq_fc_compute.cc b/lite/kernels/loongarch/search_seq_fc_compute.cc new file mode 100644 index 00000000000..1061be1e42b --- /dev/null +++ b/lite/kernels/loongarch/search_seq_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
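// SearchSeqDepaddingCompute above undoes the padding step: for every sequence
// it copies only the first src_len rows of the corresponding padded segment
// back into a contiguous [sum(src_len), dim] buffer (in the test: rows 0-1 of
// the first 4-row segment and row 0 of the second segment, giving the 3
// output rows {0..7, 16..19}). A standalone sketch over std::vector buffers
// (illustrative only):
#include <cstddef>
#include <cstring>
#include <vector>

void Depad(const std::vector<float>& pad,               // [pad_offset.back(), dim]
           const std::vector<std::size_t>& pad_offset,  // padded LoD
           const std::vector<std::size_t>& src_offset,  // original LoD
           std::size_t dim, std::vector<float>* out) {
  out->resize(src_offset.back() * dim);
  for (std::size_t i = 0; i + 1 < src_offset.size(); ++i) {
    const std::size_t src_len = src_offset[i + 1] - src_offset[i];
    std::memcpy(out->data() + src_offset[i] * dim,
                pad.data() + pad_offset[i] * dim,
                src_len * dim * sizeof(float));
  }
}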
+ +#include "lite/kernels/loongarch/search_seq_fc_compute.h" + +REGISTER_LITE_KERNEL(search_seq_fc, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/search_seq_fc_compute.h b/lite/kernels/loongarch/search_seq_fc_compute.h new file mode 100644 index 00000000000..af26155c99d --- /dev/null +++ b/lite/kernels/loongarch/search_seq_fc_compute.h @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + + auto blas = lite::loongarch::math::GetBlas(context); + blas.MatMul(*x, false, *w, true, out); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + int M = x_dims[0]; + int N = w_dims[0]; + for (int i = 0; i < M; i++) { + blas.AXPY(N, + static_cast(1), + b->template data(), + out->template mutable_data() + i * N); + } + } + } + + virtual ~SearchSeqFcCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/sequence_arithmetic_compute.cc b/lite/kernels/loongarch/sequence_arithmetic_compute.cc new file mode 100644 index 00000000000..e1958f978ee --- /dev/null +++ b/lite/kernels/loongarch/sequence_arithmetic_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/sequence_arithmetic_compute.h" + +REGISTER_LITE_KERNEL( + sequence_arithmetic, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); +REGISTER_LITE_KERNEL( + search_seq_arithmetic, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/sequence_arithmetic_compute.h b/lite/kernels/loongarch/sequence_arithmetic_compute.h new file mode 100644 index 00000000000..d9edfaef479 --- /dev/null +++ b/lite/kernels/loongarch/sequence_arithmetic_compute.h @@ -0,0 +1,125 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
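// SearchSeqFcCompute above computes Out = X * W^T with one MatMul and then
// broadcasts the bias b (length N) over the M output rows, one AXPY
// (y := alpha * x + y) per row. A sketch of just that bias broadcast over raw
// buffers (illustrative only):
void AddBiasRows(const float* b, float* out, int M, int N) {
  for (int i = 0; i < M; ++i) {
    float* row = out + i * N;
    for (int j = 0; j < N; ++j) row[j] += b[j];  // same effect as AXPY(N, 1.0f, b, row)
  }
}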
+ +#pragma once +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + int op_type = param.op_type; + + CHECK(x->dims()[0]); + + out->Resize(x->dims()); + out->set_lod(x->lod()); + + auto x_data = x->template data(); + auto y_data = y->template data(); + auto out_data = out->template mutable_data(); + auto x_seq_offset = x->lod()[0]; + auto y_seq_offset = y->lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = (x->numel()) / (x->dims()[0]); + + // sum + if (op_type == 1) { + for (int i = 0; i < seq_num; i++) { + CHECK_GT(x_seq_offset[i + 1], x_seq_offset[i]); + CHECK_GT(y_seq_offset[i + 1], y_seq_offset[i]); + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + CHECK_GT(len_x, 0); + CHECK_GT(len_y, 0); + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = (std::min)(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] + input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // sub + if (op_type == 2) { + for (int i = 0; i < seq_num; i++) { + CHECK_GT(x_seq_offset[i + 1], x_seq_offset[i]); + CHECK_GT(y_seq_offset[i + 1], y_seq_offset[i]); + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + CHECK_GT(len_x, 0); + CHECK_GT(len_y, 0); + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = (std::min)(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] - input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // mul + if (op_type == 3) { + for (int i = 0; i < seq_num; i++) { + CHECK_GT(x_seq_offset[i + 1], x_seq_offset[i]); + CHECK_GT(y_seq_offset[i + 1], y_seq_offset[i]); + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + CHECK_GT(len_x, 0); + CHECK_GT(len_y, 0); + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = (std::min)(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] * input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + } + + virtual ~SequenceArithmeticCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/sequence_arithmetic_compute_test.cc b/lite/kernels/loongarch/sequence_arithmetic_compute_test.cc new file mode 100644 index 00000000000..8742dfe0903 --- /dev/null +++ b/lite/kernels/loongarch/sequence_arithmetic_compute_test.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/sequence_arithmetic_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_loongarch, retrive_op) { + auto sequence_arithmetic = + KernelRegistry::Global().Create("sequence_arithmetic"); + ASSERT_FALSE(sequence_arithmetic.empty()); + ASSERT_TRUE(sequence_arithmetic.front()); +} + +TEST(sequence_arithmetic_loongarch, init) { + SequenceArithmeticCompute sequence_arithmetic; + ASSERT_EQ(sequence_arithmetic.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_arithmetic.target(), TARGET(kLoongArch)); +} + +TEST(sequence_arithmetic_loongarch, run_test) { + SequenceArithmeticCompute sequence_arithmetic; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + lite::Tensor x, y, out, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + prepare_input(&x, x_lod); + prepare_input(&y, y_lod); + + operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + + sequence_arithmetic_compute_ref(x, y, &out_ref, param.op_type); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_arithmetic, 
kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/sequence_concat_compute.cc b/lite/kernels/loongarch/sequence_concat_compute.cc new file mode 100644 index 00000000000..ca48f39daad --- /dev/null +++ b/lite/kernels/loongarch/sequence_concat_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/sequence_concat_compute.h" + +REGISTER_LITE_KERNEL(sequence_concat, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/sequence_concat_compute.h b/lite/kernels/loongarch/sequence_concat_compute.h new file mode 100644 index 00000000000..f2296fb289e --- /dev/null +++ b/lite/kernels/loongarch/sequence_concat_compute.h @@ -0,0 +1,107 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
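// SequenceArithmeticCompute (registered above as sequence_arithmetic and
// search_seq_arithmetic) walks the two inputs sequence by sequence via their
// LoD offsets, applies the elementwise op on the overlapping prefix, and
// copies the uncovered tail of X through unchanged. A sketch of the
// op_type == 1 ("sum") branch over raw buffers (illustrative only):
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <vector>

void SeqAdd(const float* x, const std::vector<std::size_t>& x_off,
            const float* y, const std::vector<std::size_t>& y_off,
            std::size_t inner_size, float* out) {
  for (std::size_t i = 0; i + 1 < x_off.size(); ++i) {
    const std::size_t len_x = (x_off[i + 1] - x_off[i]) * inner_size;
    const std::size_t len_y = (y_off[i + 1] - y_off[i]) * inner_size;
    const float* px = x + x_off[i] * inner_size;
    const float* py = y + y_off[i] * inner_size;
    float* po = out + x_off[i] * inner_size;
    const std::size_t len = std::min(len_x, len_y);
    for (std::size_t j = 0; j < len; ++j) po[j] = px[j] + py[j];
    if (len_x > len)  // X is longer than Y: its tail is passed through as-is
      std::memcpy(po + len, px + len, sizeof(float) * (len_x - len));
  }
}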
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + int64_t batch_size = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto& tensor : param.X) { + const auto x_dims = tensor->dims(); + CHECK(x_dims[0]); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + if (batch_size < 0) { + batch_size = -1; // Normalize batch size for compile time. + } + out_dims[0] = batch_size; + param.Out->Resize(out_dims); + + T* dout = param.Out->template mutable_data(); + + std::vector x_in_order; + param.Out->set_lod(ConcatLoD(param.X, &x_in_order)); + + int num = x_in_order.size(); + int out_rows = 1; + + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel() / out_rows; + } + + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(dout + col_idx, input_data, sizeof(T) * col_len); + col_idx += col_len; + } + } + + virtual ~SequenceConcatCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/sequence_concat_compute_test.cc b/lite/kernels/loongarch/sequence_concat_compute_test.cc new file mode 100644 index 00000000000..7f204f8ce30 --- /dev/null +++ b/lite/kernels/loongarch/sequence_concat_compute_test.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
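// ConcatLoD above interleaves the inputs per LoD interval: for interval i the
// i-th slice of every input is appended in input order, so the output offset
// at position i is simply the sum of the inputs' offsets at i. A sketch of
// that offset computation over plain vectors (illustrative only):
#include <cstddef>
#include <vector>

std::vector<std::size_t> ConcatOffsets(
    const std::vector<std::vector<std::size_t>>& lods) {  // LoD level 0 of each input
  std::vector<std::size_t> result(lods[0].size(), 0);
  for (std::size_t i = 1; i < result.size(); ++i) {
    std::size_t sum = 0;
    for (const auto& lod : lods) sum += lod[i];
    result[i] = sum;
  }
  return result;
}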
+ +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/sequence_concat_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + float* name##_data = name.mutable_data(); \ + for (int i = 0; i < name.numel(); ++i) { \ + name##_data[i] = (i - 2.0) * 1.0; \ + } + +} // namespace + +TEST(sequence_concat_loongarch, retrive_op) { + auto sequence_concat = KernelRegistry::Global().Create("sequence_concat"); + ASSERT_FALSE(sequence_concat.empty()); + ASSERT_TRUE(sequence_concat.front()); +} + +TEST(sequence_concat_loongarch, init) { + SequenceConcatCompute sequence_concat; + ASSERT_EQ(sequence_concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_concat.target(), TARGET(kLoongArch)); +} + +TEST(sequence_concat_loongarch, run_test) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3; + lite::Tensor y, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i < lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT(x1); + PREPARE_INPUT(x2); + PREPARE_INPUT(x3); + + y_ref.Resize({y_lod_len, feature_len}); + y.Resize({y_lod_len, feature_len}); + y_ref.set_lod(lod_info_y); + y.set_lod(lod_info_y); + + std::vector xs{&x1, &x2, &x3}; + + param.X = xs; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + auto* y_data = y.mutable_data(); + 
sequence_concat_ref(xs, &y_ref); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_concat, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/sequence_conv_compute.cc b/lite/kernels/loongarch/sequence_conv_compute.cc new file mode 100644 index 00000000000..7d62ba6d339 --- /dev/null +++ b/lite/kernels/loongarch/sequence_conv_compute.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/sequence_conv_compute.h" + +REGISTER_LITE_KERNEL(sequence_conv, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SequenceConvCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/sequence_conv_compute.h b/lite/kernels/loongarch/sequence_conv_compute.h new file mode 100644 index 00000000000..700f0e1f61d --- /dev/null +++ b/lite/kernels/loongarch/sequence_conv_compute.h @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include "lite/backends/loongarch/math/blas.h" +#include "lite/backends/loongarch/math/context_project.h" +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +namespace math = paddle::lite::loongarch::math; + +template +class SequenceConvCompute : public KernelLite { + public: + using param_t = operators::SequenceConvParam; + + void Run() override { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* filter = param.Filter; + auto* out = param.Out; + out->template mutable_data(); + CHECK(in->lod().size() == 1) << "Only support one level sequence now"; + + int context_start = param.contextStart; + int context_stride = param.contextStride; + int context_length = param.contextLength; + bool padding_trainable = false; + const Tensor* padding_data = nullptr; + + int up_pad = (std::max)(0, -context_start); + int down_pad = (std::max)(0, context_start + context_length - 1); + auto sequence_width = static_cast(in->dims()[1]); + + std::vector col_shape{in->dims()[0], + context_length * sequence_width}; + Tensor col; + col.Resize(col_shape); + col.mutable_data(); + + // Because if padding_trainable is false, padding data should be zeros. + math::SetConstant set_zero; + auto blas = math::GetBlas(ctx); + set_zero(ctx, &col, static_cast(0)); + math::ContextProjectFunctor seq_project_functor; + + seq_project_functor(ctx, + *in, + padding_data, + padding_trainable, + context_start, + context_length, + context_stride, + up_pad, + down_pad, + &col); + + blas.MatMul(col, *filter, out); + } + + virtual ~SequenceConvCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/sequence_expand_as_compute.cc b/lite/kernels/loongarch/sequence_expand_as_compute.cc new file mode 100644 index 00000000000..d3e9472fdbd --- /dev/null +++ b/lite/kernels/loongarch/sequence_expand_as_compute.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
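// SequenceConvCompute above lowers the 1-D sequence convolution to a GEMM:
// the context projection writes, for each time step t, the input rows
// [t + context_start, t + context_start + context_length) side by side into
// one row of `col` (zero where the window falls outside the sequence, since
// padding_trainable is false), and a single MatMul(col, filter) then produces
// the output. A sketch of that expansion for one sequence, assuming
// context_stride == 1 (illustrative only):
#include <cstddef>
#include <vector>

void ContextProject(const std::vector<float>& in,  // [seq_len, width], row-major
                    int seq_len, int width,
                    int context_start, int context_length,
                    std::vector<float>* col) {     // [seq_len, context_length * width]
  col->assign(static_cast<std::size_t>(seq_len) * context_length * width, 0.f);
  for (int t = 0; t < seq_len; ++t) {
    for (int c = 0; c < context_length; ++c) {
      const int src = t + context_start + c;
      if (src < 0 || src >= seq_len) continue;  // outside the sequence: stays zero
      for (int k = 0; k < width; ++k) {
        (*col)[(static_cast<std::size_t>(t) * context_length + c) * width + k] =
            in[static_cast<std::size_t>(src) * width + k];
      }
    }
  }
}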
+ +#include "lite/kernels/loongarch/sequence_expand_as_compute.h" + +using sequence_expand_as_float32 = + paddle::lite::kernels::loongarch::SequenceExpandAsCompute; +REGISTER_LITE_KERNEL( + sequence_expand_as, kLoongArch, kFloat, kNCHW, sequence_expand_as_float32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +using sequence_expand_as_int32 = + paddle::lite::kernels::loongarch::SequenceExpandAsCompute; +REGISTER_LITE_KERNEL( + sequence_expand_as, kLoongArch, kFloat, kNCHW, sequence_expand_as_int32, int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +using sequence_expand_as_int64 = + paddle::lite::kernels::loongarch::SequenceExpandAsCompute; +REGISTER_LITE_KERNEL( + sequence_expand_as, kLoongArch, kFloat, kNCHW, sequence_expand_as_int64, int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/loongarch/sequence_expand_as_compute.h b/lite/kernels/loongarch/sequence_expand_as_compute.h new file mode 100644 index 00000000000..261f2d6c42f --- /dev/null +++ b/lite/kernels/loongarch/sequence_expand_as_compute.h @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +using Tensor = lite::Tensor; + +template +struct SequenceExpandFunctor { + void operator()( + const Tensor &x, + const std::vector &ref_lod, /*expand referenced lod*/ + Tensor *out) { + int64_t hight = x.dims()[0]; + int64_t width = x.data_size() / hight; + + const T *in_data = x.data(); + T *out_data = out->mutable_data(); + + for (int h_id = 0; h_id < hight; ++h_id) { + uint64_t span = ref_lod[h_id + 1] - ref_lod[h_id]; + if (span == 0) continue; + const T *src = in_data + h_id * width; + for (uint64_t w_id = 0; w_id < width; ++w_id) { + T ele = src[w_id]; + size_t offset = ref_lod[h_id] * width; + for (uint64_t k = 0; k < span; ++k) { + out_data[offset + k * width + w_id] = ele; + } + } + } + } +}; + +template +class SequenceExpandAsCompute : public KernelLite { + public: + void Run() override { + auto ¶m = this->template Param(); + + auto *x = param.x; + auto *y = param.y; + auto *out = param.out; + + auto &y_lod = y->lod(); + CHECK_EQ(y_lod.size(), 1u); + CHECK_GT(y_lod[0].size(), 1u); + + out->template mutable_data(); + + SequenceExpandFunctor seq_espand_functor; + seq_espand_functor(*x, y_lod[0], out); + } +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/sequence_expand_as_compute_test.cc b/lite/kernels/loongarch/sequence_expand_as_compute_test.cc new file mode 100644 index 00000000000..f6ce79a28c5 --- /dev/null +++ b/lite/kernels/loongarch/sequence_expand_as_compute_test.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
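// SequenceExpandFunctor above repeats every row of X according to the
// reference LoD taken from Y: row h is written span = ref_lod[h+1] - ref_lod[h]
// times into the output rows [ref_lod[h], ref_lod[h+1]). A sketch over raw
// buffers (illustrative only):
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

void ExpandAs(const float* x, std::size_t height, std::size_t width,
              const std::vector<std::uint64_t>& ref_lod, float* out) {
  for (std::size_t h = 0; h < height; ++h) {
    const std::uint64_t span = ref_lod[h + 1] - ref_lod[h];
    for (std::uint64_t k = 0; k < span; ++k) {
      std::memcpy(out + (ref_lod[h] + k) * width, x + h * width,
                  width * sizeof(float));
    }
  }
}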
+ +#include + +#include +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/sequence_expand_as_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(sequence_expand_as_loongarch, retrive_op) { + auto sequence_expand_as = + KernelRegistry::Global().Create("sequence_expand_as"); + ASSERT_FALSE(sequence_expand_as.empty()); + ASSERT_TRUE(sequence_expand_as.front()); +} + +TEST(sequence_expand_as_loongarch, init) { + SequenceExpandAsCompute sequence_expand_as; + ASSERT_EQ(sequence_expand_as.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_expand_as.target(), TARGET(kLoongArch)); +} + +TEST(sequence_expand_as_loongarch, run_test) { + lite::Tensor x, y, out; + std::vector x_shape{4, 1}; + x.Resize(lite::DDim(x_shape)); + std::vector y_shape{1, 5}; + y.Resize(lite::DDim(y_shape)); + std::vector out_shape{8, 1}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto y_data = y.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < y.dims().production(); i++) { + y_data[i] = static_cast(i); + } + + std::vector> lod{{0, 3, 6, 7, 8}}; + y.set_lod(lod); + // MulCompute mul; + SequenceExpandAsCompute sequence_expand_as; + operators::SequenceExpandAsParam param; + + param.x = &x; + param.y = &y; + param.out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + sequence_expand_as.SetContext(std::move(ctx)); + sequence_expand_as.SetParam(param); + sequence_expand_as.Run(); + auto out_data = out.mutable_data(); + + int index = 1; + int lod_sum = lod[0][index]; + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + if (i >= lod_sum) { + index++; + lod_sum = lod[0][index]; + } + ASSERT_EQ(out_data[i], x_data[index - 1]); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_expand_as, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/sequence_pool_compute.cc b/lite/kernels/loongarch/sequence_pool_compute.cc new file mode 100644 index 00000000000..d528cda7452 --- /dev/null +++ b/lite/kernels/loongarch/sequence_pool_compute.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/kernels/loongarch/sequence_pool_compute.h"
+
+REGISTER_LITE_KERNEL(sequence_pool,
+                     kLoongArch,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::loongarch::SequencePoolCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .Finalize();
diff --git a/lite/kernels/loongarch/sequence_pool_compute.h b/lite/kernels/loongarch/sequence_pool_compute.h
new file mode 100644
index 00000000000..01598482322
--- /dev/null
+++ b/lite/kernels/loongarch/sequence_pool_compute.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <vector>
+#include "lite/backends/loongarch/math/math_function.h"
+#include "lite/backends/loongarch/math/sequence_pooling.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace loongarch {
+
+template <typename T>
+class SequencePoolCompute
+    : public KernelLite<TARGET(kLoongArch), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SequencePoolParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    auto& context = ctx_->As<LoongArchContext>();
+    auto* out = param.Out;
+    auto dims = param.X->dims();
+    auto lod = param.X->lod();
+    auto* index = param.MaxIndex;
+    CHECK_LE(lod.size(), 2UL);
+    CHECK_GE(dims[0], static_cast<int64_t>(lod[lod.size() - 1].size() - 1));
+
+    dims[0] = lod[lod.size() - 1].size() - 1;
+    out->Resize({dims});
+    out->template mutable_data<T>();
+
+    const bool is_test = true;
+    float pad_value = param.pad_value;
+
+    lite::loongarch::math::SequencePoolFunctor<lite::TargetType::kLoongArch, T>
+        pool;
+    pool(context, param.pool_type, pad_value, *param.X, out, is_test, index);
+
+    int batch_size = lod.size() - 1;
+    std::vector<uint64_t> offset_new;
+    if (param.X->lod().size() == 2) {
+      offset_new.resize(param.X->lod()[0].size());
+      offset_new = param.X->lod()[0];
+    } else {
+      offset_new.resize(batch_size + 1);
+      for (int i = 0; i <= batch_size; i++) {
+        offset_new[i] = i;
+      }
+    }
+
+    out->mutable_lod()->clear();
+    out->mutable_lod()->push_back(offset_new);
+  }
+
+  virtual ~SequencePoolCompute() = default;
+};
+
+}  // namespace loongarch
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/loongarch/sequence_pool_compute_test.cc b/lite/kernels/loongarch/sequence_pool_compute_test.cc
new file mode 100644
index 00000000000..0f3c4c64ed2
--- /dev/null
+++ b/lite/kernels/loongarch/sequence_pool_compute_test.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/sequence_pool_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(sequence_pool_loongarch, retrive_op) { + auto sequence_pool = KernelRegistry::Global().Create("sequence_pool"); + ASSERT_FALSE(sequence_pool.empty()); + ASSERT_TRUE(sequence_pool.front()); +} + +TEST(sequence_pool_loongarch, init) { + SequencePoolCompute sequence_pool; + ASSERT_EQ(sequence_pool.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_pool.target(), TARGET(kLoongArch)); +} + +TEST(sequence_pool_loongarch, run_test) { + lite::Tensor x, out; + lite::LoD lod; + lod.push_back(std::vector{0, 10}); + + x.set_lod(lod); + const size_t second_dim = 8u; + std::vector input_shape{static_cast(lod[0].back()), + static_cast(second_dim)}; + lite::DDim in_dims(input_shape); + x.Resize(in_dims); + + const size_t out_first_dim = lod[0].size() - 1; + std::vector output_shape{static_cast(out_first_dim), + static_cast(second_dim)}; + lite::DDim out_dims(output_shape); + out.Resize(out_dims); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = 1.1f * i; + } + + SequencePoolCompute sequence_pool; + operators::SequencePoolParam param; + param.X = &x; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + sequence_pool.SetContext(std::move(ctx)); + sequence_pool.SetParam(param); + sequence_pool.Run(); + + std::vector ref_results = { + 39.6f, 40.7f, 41.8f, 42.9f, 44.f, 45.1f, 46.2f, 47.3f}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_pool, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/sequence_reshape_compute.cc b/lite/kernels/loongarch/sequence_reshape_compute.cc new file mode 100644 index 00000000000..91fd32db290 --- /dev/null +++ b/lite/kernels/loongarch/sequence_reshape_compute.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/loongarch/sequence_reshape_compute.h" + +REGISTER_LITE_KERNEL( + sequence_reshape, + kLoongArch, + kInt64, + kNCHW, + paddle::lite::kernels::loongarch::SequenceReshapeCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + sequence_reshape, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SequenceReshapeFloatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/loongarch/sequence_reshape_compute.h b/lite/kernels/loongarch/sequence_reshape_compute.h new file mode 100644 index 00000000000..200de023525 --- /dev/null +++ b/lite/kernels/loongarch/sequence_reshape_compute.h @@ -0,0 +1,123 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class SequenceReshapeCompute + : public KernelLite { + public: + using param_t = operators::SequenceReshapeParam; + + void Run() override { + auto& param = *param_.get_mutable(); + // auto& context = context_->As(); + auto* in = param.x; + auto* out = param.output; + int out_width = param.new_dim; + + const auto& in_dims = in->dims(); + int64_t in_width = in_dims[1]; + + auto& in_lod = in->lod(); + CHECK_EQ(in_lod.size(), 1UL); + CHECK_EQ((uint64_t)in_dims[0], in_lod[0].back()); + + auto in_lod_l0 = in_lod[0]; + int seq_num = in_lod_l0.size() - 1; + + if (in_width == out_width) { + out->set_lod(in->lod()); + } else { + auto& out_lod = *out->mutable_lod(); + out_lod.resize(1); + out_lod[0].resize(seq_num + 1); + out_lod[0][0] = 0; + for (int i = 0; i < seq_num; ++i) { + size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i]; + size_t offset = 0; + offset = (seq_len * in_width) / out_width; + CHECK_EQ(offset * out_width, seq_len * in_width); + out_lod[0][i + 1] = out_lod[0][i] + offset; + } + } + + out->Resize(std::vector{in->numel() / out_width, out_width}); + auto* dst_ptr = out->template mutable_data(); + auto size = in->numel() * sizeof(T); + std::memcpy(dst_ptr, in->template data(), size); + } + + virtual ~SequenceReshapeCompute() = default; +}; + +template +class SequenceReshapeFloatCompute + : public KernelLite { + public: + using param_t = operators::SequenceReshapeParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* in = param.x; + auto* out = param.output; + auto out_data = out->template mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + out_data[i] = 0; + } + int out_width = param.new_dim; + const auto& in_dims = in->dims(); 
+ int64_t in_width = in_dims[1]; + auto& in_lod = in->lod(); + CHECK_EQ(in_lod.size(), 1UL); + CHECK_EQ((uint64_t)in_dims[0], in_lod[0].back()); + auto in_lod_l0 = in_lod[0]; + int seq_num = in_lod_l0.size() - 1; + if (in_width == out_width) { + out->set_lod(in->lod()); + } else { + auto& out_lod = *out->mutable_lod(); + out_lod.resize(1); + out_lod[0].resize(seq_num + 1); + out_lod[0][0] = 0; + for (int i = 0; i < seq_num; ++i) { + size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i]; + size_t offset = 0; + offset = (seq_len * in_width) / out_width; + CHECK_EQ(offset * out_width, seq_len * in_width); + out_lod[0][i + 1] = out_lod[0][i] + offset; + } + } + out->Resize(std::vector{in->numel() / out_width, out_width}); + auto* dst_ptr = out->template mutable_data(); + auto size = in->numel() * sizeof(T); + std::memcpy(dst_ptr, in->template data(), size); + } + + virtual ~SequenceReshapeFloatCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/sequence_reverse_compute.cc b/lite/kernels/loongarch/sequence_reverse_compute.cc new file mode 100644 index 00000000000..8eb4a656577 --- /dev/null +++ b/lite/kernels/loongarch/sequence_reverse_compute.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/sequence_reverse_compute.h" + +typedef paddle::lite::kernels::loongarch::SequenceReverseCompute ReverseFp32; +typedef paddle::lite::kernels::loongarch::SequenceReverseCompute ReverseInt32; +typedef paddle::lite::kernels::loongarch::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kLoongArch, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kLoongArch, kFloat, kNCHW, ReverseInt32, int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kLoongArch, kFloat, kNCHW, ReverseInt64, int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/loongarch/sequence_reverse_compute.h b/lite/kernels/loongarch/sequence_reverse_compute.h new file mode 100644 index 00000000000..96990892b40 --- /dev/null +++ b/lite/kernels/loongarch/sequence_reverse_compute.h @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cstring>
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace loongarch {
+
+template <typename T>
+class SequenceReverseCompute
+    : public KernelLite<TARGET(kLoongArch), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SequenceReverseParam;
+
+  void Run() override {
+    auto& param = this->template Param<param_t>();
+    auto* output = param.Out;
+    const auto* din = param.X->template data<T>();
+
+    T* dout = output->template mutable_data<T>();
+    CHECK_NE(din, dout)
+        << "SequenceReverse Op does not support in-place operation";
+    const auto lod = param.X->lod()[param.X->lod().size() - 1];
+    const size_t lod_count = lod.size();
+
+    size_t limit = static_cast<size_t>(param.X->numel());
+    size_t row_numel = static_cast<size_t>(limit / param.X->dims()[0]);
+
+    // Reverse each sequence row-by-row within its lod segment.
+    for (size_t idx = 0; idx < lod_count - 1; ++idx) {
+      auto start_pos = lod[idx];
+      auto end_pos = lod[idx + 1];
+      for (auto pos = start_pos; pos < end_pos; ++pos) {
+        auto cur_pos = end_pos - pos - 1 + start_pos;
+        std::memcpy(dout + pos * row_numel,
+                    din + cur_pos * row_numel,
+                    row_numel * sizeof(T));
+      }
+    }
+    output->set_lod(param.X->lod());
+  }
+
+  virtual ~SequenceReverseCompute() = default;
+};
+
+}  // namespace loongarch
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/loongarch/sequence_topk_avg_pooling_compute.cc b/lite/kernels/loongarch/sequence_topk_avg_pooling_compute.cc
new file mode 100644
index 00000000000..d03eb952e7d
--- /dev/null
+++ b/lite/kernels/loongarch/sequence_topk_avg_pooling_compute.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/loongarch/sequence_topk_avg_pooling_compute.h"
+
+REGISTER_LITE_KERNEL(
+    sequence_topk_avg_pooling,
+    kLoongArch,
+    kFloat,
+    kNCHW,
+    paddle::lite::kernels::loongarch::SequenceTopkAvgPoolingCompute<float>,
+    def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kLoongArch))})
+    .Finalize();
diff --git a/lite/kernels/loongarch/sequence_topk_avg_pooling_compute.h b/lite/kernels/loongarch/sequence_topk_avg_pooling_compute.h
new file mode 100644
index 00000000000..0dc66c818aa
--- /dev/null
+++ b/lite/kernels/loongarch/sequence_topk_avg_pooling_compute.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "lite/backends/loongarch/math/sequence_topk_avg_pooling.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/types.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace loongarch {
+
+template <typename T>
+class SequenceTopkAvgPoolingCompute
+    : public KernelLite<TARGET(kLoongArch), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SequenceTopkAvgPoolingParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    lite::loongarch::math::SequenceTopkAvgPoolingFunctor<T>
+        sequence_topk_avg_pooling;
+    sequence_topk_avg_pooling(*param.X,
+                              *param.ROW,
+                              *param.COLUMN,
+                              param.Out,
+                              param.pos,
+                              param.channel_num,
+                              param.topks);
+  }
+  virtual ~SequenceTopkAvgPoolingCompute() = default;
+};
+
+}  // namespace loongarch
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/loongarch/set_value_compute.cc b/lite/kernels/loongarch/set_value_compute.cc
new file mode 100644
index 00000000000..47f376d073e
--- /dev/null
+++ b/lite/kernels/loongarch/set_value_compute.cc
@@ -0,0 +1,235 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/loongarch/set_value_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +void SetValueCompute::Run() { + auto& param = *param_.get_mutable(); +#define SET_VALUE_WITH_TENSOR(__starts__, __ends__, __steps__) \ + if (param.ValueTensor != nullptr) { \ + SetTensorValueKernel(param.Input, \ + param.ValueTensor, \ + __starts__, \ + __ends__, \ + __steps__, \ + param.axes, \ + param.decrease_axes, \ + param.none_axes, \ + param.Out); \ + return; \ + } + +#define SET_VALUE(__precision__, __starts__, __ends__, __steps__, __values__) \ + if (!__values__.empty()) { \ + SetValue<__precision__>(param.Input, \ + __starts__, \ + __ends__, \ + __steps__, \ + param.axes, \ + param.decrease_axes, \ + param.none_axes, \ + param.shape, \ + __values__, \ + param.Out); \ + return; \ + } + + if (param.StartsTensorList.size() > 0) { + auto starts = GetDataFromTensorList(param.StartsTensorList); + if (param.EndsTensorList.size() > 0) { + auto ends = GetDataFromTensorList(param.EndsTensorList); + if (param.StepsTensorList.size() > 0) { + auto steps = GetDataFromTensorList(param.StepsTensorList); + SET_VALUE_WITH_TENSOR(starts, ends, steps) + SET_VALUE(float, starts, ends, steps, param.fp32_values) + SET_VALUE(double, starts, ends, steps, param.fp64_values) + SET_VALUE(int, starts, ends, steps, param.int32_values) + SET_VALUE(int64_t, starts, ends, steps, param.int64_values) + SET_VALUE(int, starts, ends, steps, param.bool_values) + } else { + SET_VALUE_WITH_TENSOR(starts, ends, param.steps) + SET_VALUE(float, starts, ends, param.steps, param.fp32_values) + SET_VALUE(double, starts, ends, param.steps, param.fp64_values) + SET_VALUE(int, starts, ends, param.steps, param.int32_values) + SET_VALUE(int64_t, starts, ends, param.steps, param.int64_values) + SET_VALUE(int, starts, ends, param.steps, param.bool_values) + } + } else { + if (param.StepsTensorList.size() > 0) { + auto steps = GetDataFromTensorList(param.StepsTensorList); + SET_VALUE_WITH_TENSOR(starts, param.ends, steps) + SET_VALUE(float, starts, param.ends, steps, param.fp32_values) + SET_VALUE(double, starts, param.ends, steps, param.fp64_values) + SET_VALUE(int, starts, param.ends, steps, param.int32_values) + SET_VALUE(int64_t, starts, param.ends, steps, param.int64_values) + SET_VALUE(int, starts, param.ends, steps, param.bool_values) + } else { + SET_VALUE_WITH_TENSOR(starts, param.ends, param.steps) + SET_VALUE(float, starts, param.ends, param.steps, param.fp32_values) + SET_VALUE(double, starts, param.ends, param.steps, param.fp64_values) + SET_VALUE(int, starts, param.ends, param.steps, param.int32_values) + SET_VALUE(int64_t, starts, param.ends, param.steps, param.int64_values) + SET_VALUE(int, starts, param.ends, param.steps, param.bool_values) + } + } + } else { + if (param.EndsTensorList.size() > 0) { + auto ends = GetDataFromTensorList(param.EndsTensorList); + if (param.StepsTensorList.size() > 0) { + auto steps = GetDataFromTensorList(param.StepsTensorList); + SET_VALUE_WITH_TENSOR(param.starts, ends, steps) + SET_VALUE(float, param.starts, ends, steps, param.fp32_values) + SET_VALUE(double, param.starts, ends, steps, param.fp64_values) + SET_VALUE(int, param.starts, ends, steps, param.int32_values) + SET_VALUE(int64_t, param.starts, ends, steps, param.int64_values) + SET_VALUE(int, param.starts, ends, steps, param.bool_values) + } else { + SET_VALUE_WITH_TENSOR(param.starts, ends, param.steps) + SET_VALUE(float, param.starts, ends, param.steps, param.fp32_values) + 
SET_VALUE(double, param.starts, ends, param.steps, param.fp64_values) + SET_VALUE(int, param.starts, ends, param.steps, param.int32_values) + SET_VALUE(int64_t, param.starts, ends, param.steps, param.int64_values) + SET_VALUE(int, param.starts, ends, param.steps, param.bool_values) + } + } else { + if (param.StepsTensorList.size() > 0) { + auto steps = GetDataFromTensorList(param.StepsTensorList); + SET_VALUE_WITH_TENSOR(param.starts, param.ends, steps) + SET_VALUE(float, param.starts, param.ends, steps, param.fp32_values) + SET_VALUE(double, param.starts, param.ends, steps, param.fp64_values) + SET_VALUE(int, param.starts, param.ends, steps, param.int32_values) + SET_VALUE(int64_t, param.starts, param.ends, steps, param.int64_values) + SET_VALUE(int, param.starts, param.ends, steps, param.bool_values) + } else { + SET_VALUE_WITH_TENSOR(param.starts, param.ends, param.steps) + SET_VALUE( + float, param.starts, param.ends, param.steps, param.fp32_values) + SET_VALUE( + double, param.starts, param.ends, param.steps, param.fp64_values) + SET_VALUE( + int, param.starts, param.ends, param.steps, param.int32_values) + SET_VALUE( + int64_t, param.starts, param.ends, param.steps, param.int64_values) + SET_VALUE(int, param.starts, param.ends, param.steps, param.bool_values) + } + } + } +#undef SET_VALUE_WITH_TENSOR +#undef SET_VALUE +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +// float +REGISTER_LITE_KERNEL(set_value, + kLoongArch, + kAny, + kNCHW, + paddle::lite::kernels::loongarch::SetValueCompute, + fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("ValueTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StepsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(set_value, + kLoongArch, + kAny, + kNCHW, + paddle::lite::kernels::loongarch::SetValueCompute, + int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("ValueTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StepsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(set_value, + kLoongArch, + kAny, + kNCHW, + paddle::lite::kernels::loongarch::SetValueCompute, + int64) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("ValueTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StepsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + 
+REGISTER_LITE_KERNEL(set_value, + kLoongArch, + kAny, + kNCHW, + paddle::lite::kernels::loongarch::SetValueCompute, + bool) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kBool))}) + .BindInput("ValueTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StepsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kBool))}) + .Finalize(); + +REGISTER_LITE_KERNEL(set_value, + kLoongArch, + kAny, + kNCHW, + paddle::lite::kernels::loongarch::SetValueCompute, + double) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFP64))}) + .BindInput("ValueTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StepsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFP64))}) + .Finalize(); diff --git a/lite/kernels/loongarch/set_value_compute.h b/lite/kernels/loongarch/set_value_compute.h new file mode 100644 index 00000000000..d8c0a32d4af --- /dev/null +++ b/lite/kernels/loongarch/set_value_compute.h @@ -0,0 +1,398 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/elementwise_op_function.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +using EigenTensor = lite::fluid::EigenTensor; + +template +struct SubtractFunctor { + inline T operator()(const T a, const T b) const { return a - b; } +}; + +// check whether the tensor with dimension of second can assign to the +// tensor with dimension of first +inline void CheckIsDimsMatch(const DDim& first, const DDim& second) { + int ignore_axis1 = 0, ignore_axis2 = 0; + for (; ignore_axis1 < first.size(); ++ignore_axis1) { + if (first[ignore_axis1] != 1) { + break; + } + } + for (; ignore_axis2 < second.size(); ++ignore_axis2) { + if (second[ignore_axis2] != 1) { + break; + } + } + + if (second.size() == ignore_axis2) { + // second tensor has only one value + return; + } + + if (first.size() - ignore_axis1 >= second.size() - ignore_axis2) { + int idx1 = first.size() - 1; + int idx2 = second.size() - 1; + bool is_match = true; + for (; idx2 >= ignore_axis2; idx2--) { + if (first[idx1--] != second[idx2] && second[idx2] != 1) { + is_match = false; + break; + } + } + if (is_match) { + return; + } + } + LOG(FATAL) << "The shape of tensor assigned value must match the shape of " + "target shape: " + << second << ", but now shape is " << first << "."; +} + +template +void CheckAndUpdateSliceAttrs(const DDim in_dims, + const std::vector& axes, + std::vector* starts, + std::vector* ends, + std::vector* steps = nullptr, + std::vector* infer_flags = nullptr) { + for (size_t i = 0; i < axes.size(); ++i) { + T axis = axes[i]; + CHECK_LT(axis, in_dims.size()) << "The axis value should be less than " + "the rank of input, but received axes[" + << i << "] = " << axis << "rank of input is " + << in_dims.size() << "."; + + if (infer_flags != nullptr && (*infer_flags)[i] == -1) { + continue; + } + + T dim_value = in_dims[axis]; + + if (dim_value > 0) { + T step = steps == nullptr ? 1 : (*steps)[i]; + CHECK_NE(step, 0) << "Step should not be 0, but received step = " << step + << "."; + T start = (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i]; + start = std::max(start, static_cast(0)); + + T end = + 0 < step && (*ends)[i] < 0 ? 
((*ends)[i] + dim_value) : (*ends)[i]; + end = std::min(end, dim_value); + + if (step > 0) { + start = std::min(start, dim_value); + end = std::max(end, static_cast(0)); + CHECK_GE(end, start) + << "When step > 0, end should be greater than start, but " + "received end = " + << end << ", start = " << start << "."; + } else { + start = std::min(start, dim_value - 1); + if (end < -1) { + end += dim_value; + } + end = std::max(end, static_cast(-1)); + CHECK_GE(start, end) + << "When step < 0, start should be greater than end, but " + "received end = " + << end << ", start = " << start << "."; + } + + (*starts)[i] = start; + (*ends)[i] = end; + } else if (dim_value == 0) { + (*starts)[i] = 0; + (*ends)[i] = 0; + } + } +} + +template +DDim GetSliceDims(const DDim in_dims, + const std::vector& axes, + const std::vector& starts, + const std::vector& ends, + std::vector* steps = nullptr, + std::vector* infer_flags = nullptr) { + DDim slice_dims(in_dims); + + for (size_t i = 0; i < axes.size(); ++i) { + T axis = axes[i]; + if (infer_flags != nullptr && (*infer_flags)[i] == -1) { + slice_dims[axis] = -1; + continue; + } + + if (in_dims[axis] == -1) { + continue; + } + + T start = starts[i]; + T end = ends[i]; + T step = steps == nullptr ? 1 : (*steps)[i]; + + if (step > 0) { + slice_dims[axis] = (end - start + step - 1) / step; + } else { + slice_dims[axis] = (end - start + step + 1) / step; + } + } + return slice_dims; +} + +template +inline DDim GetDecreasedDims(const DDim slice_dims, + const std::vector& decrease_axes, + std::vector* infer_flags = nullptr) { + DDim decreased_dims(slice_dims); + std::vector decrease_flag(slice_dims.size(), 0); + if (decrease_axes.size() > 0) { + for (size_t i = 0; i < decrease_axes.size(); ++i) { + T axis = decrease_axes[i]; + decrease_flag[axis] = 1; + if (infer_flags && (*infer_flags)[i] != -1) { + CHECK_EQ(decreased_dims[axis], 1) + << "Decrease dim should be 1, but now received " + << decreased_dims[axis] << "."; + } + } + + std::vector new_shape; + for (int i = 0; i < decreased_dims.size(); ++i) { + if (decrease_flag[i] == 0) { + new_shape.push_back(decreased_dims[i]); + } + } + + if (new_shape.size() == 0) { + new_shape.push_back(1); + } + + decreased_dims = DDim(new_shape); + } + return decreased_dims; +} + +static inline std::vector GetDataFromTensorList( + const std::vector& tensor_list) { + // get tensor + std::vector vec_new_data; + for (size_t i = 0; i < tensor_list.size(); ++i) { + auto tensor = tensor_list[i]; + CHECK_EQ(tensor->dims(), DDim({1})) << "shape of dim tensor should be [1]"; + vec_new_data.push_back(static_cast(*tensor->data())); + } + return vec_new_data; +} + +template +class SetValueCompute : public KernelLite { + public: + using param_t = operators::SetValueParam; + + template + void SetValueImpl(const lite::Tensor* input, + const lite::Tensor* value, + std::vector& starts, + std::vector& ends, + std::vector& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + lite::Tensor* out) { + auto in_dims = input->dims(); + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); + auto slice_dims = + GetSliceDims(in_dims, axes, starts, ends, &steps); + auto decrease_slice_dims = + GetDecreasedDims(slice_dims, decrease_axes); + auto slice_dims_for_assign = decrease_slice_dims; + + if (!none_axes.empty()) { + std::vector slice_dims_with_none; + + size_t none_axes_cur = 0, decrease_axes_cur = 0; + for (int i = 0; i < slice_dims.size(); ++i) { + while (none_axes_cur < none_axes.size() && 
+ none_axes[none_axes_cur] <= i) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + if (decrease_axes_cur < decrease_axes.size() && + decrease_axes[decrease_axes_cur] == i) { + decrease_axes_cur++; + } else { + slice_dims_with_none.push_back(slice_dims[i]); + } + } + while (none_axes_cur < none_axes.size()) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + slice_dims_for_assign = DDim(slice_dims_with_none); + } + + auto eigen_place = lite::fluid::EigenDeviceType(); + out->Resize(in_dims); + out->CopyDataFrom(*input); + + lite::Tensor slice_tensor; + lite::Tensor pad_tensor; + slice_tensor.Resize(slice_dims); + slice_tensor.mutable_data(); + pad_tensor.Resize(in_dims); + pad_tensor.mutable_data(); + + auto pad_e = EigenTensor::From(pad_tensor, in_dims); + auto out_e = EigenTensor::From(*out); + auto slice_e = EigenTensor::From(slice_tensor, slice_dims); + // Step 1: Set the value of out at `_index` to zero + slice_e.device(eigen_place) = slice_e.constant(T(0)); + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto strides_indices = Eigen::DSizes(); + + for (size_t i = 0; i < RANK; ++i) { + starts_indices[i] = 0; + ends_indices[i] = slice_dims[i]; + strides_indices[i] = 1; + } + for (size_t i = 0; i < axes.size(); i++) { + int axis_index = axes[i]; + starts_indices[axis_index] = starts[i]; + ends_indices[axis_index] = ends[i]; + strides_indices[axis_index] = steps[i]; + if (starts[i] == ends[i]) { // slice is empty, data will not be changed + return; + } + } + + out_e.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(eigen_place) = slice_e; + + // Step 2: Set a tensor with the same shape as out tensor. And its data at + // '_index' is the same as value, and data out of '_index' to zero + slice_tensor.Resize(slice_dims_for_assign); + CheckIsDimsMatch(slice_dims_for_assign, value->dims()); + // ElementwiseComputeEx can do broadcasting + ElementwiseComputeEx, lite::TargetType::kLoongArch, T, T>( + ctx_->As(), + &slice_tensor, + value, + -1, + SubtractFunctor(), + &slice_tensor); + slice_tensor.Resize(slice_dims); + // - Step 2.2 Pad slice tensor with 0 + pad_e.device(eigen_place) = pad_e.constant(T(0)); + pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(eigen_place) = slice_e; + // Step 3: Set out tensor with value + out->mutable_data(); + out_e.device(eigen_place) = out_e - pad_e; + } + + template + void SetValue(const lite::Tensor* input, + std::vector& starts, + std::vector& ends, + std::vector& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + const std::vector& shape, + const std::vector& values, + lite::Tensor* out) { + lite::Tensor value_tensor; + value_tensor.Resize(shape); + T* value_tensor_data = value_tensor.mutable_data(); + std::memcpy(static_cast(value_tensor_data), + static_cast(values.data()), + sizeof(T) * values.size()); + SetTensorValueKernel(input, + &value_tensor, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + out); + } + + template + void SetTensorValueKernel(const lite::Tensor* input, + const lite::Tensor* value, + std::vector& starts, + std::vector& ends, + std::vector& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + lite::Tensor* out) { + const int rank = input->dims().size(); + switch (rank) { +#define SET_VALUE_IMPL(rank) \ + case rank: { \ + SetValueImpl(input, \ + value, \ + starts, \ + ends, \ + steps, \ + axes, \ + decrease_axes, 
\ + none_axes, \ + out); \ + break; \ + } + SET_VALUE_IMPL(1) + SET_VALUE_IMPL(2) + SET_VALUE_IMPL(3) + SET_VALUE_IMPL(4) + SET_VALUE_IMPL(5) + SET_VALUE_IMPL(6) + default: + LOG(FATAL) << "The rank of input should be less than 7, but received " + << rank; + break; + } + } + + void Run() override; + + virtual ~SetValueCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/slice_compute.cc b/lite/kernels/loongarch/slice_compute.cc new file mode 100644 index 00000000000..356410b88b5 --- /dev/null +++ b/lite/kernels/loongarch/slice_compute.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/slice_compute.h" + +REGISTER_LITE_KERNEL(slice, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SliceCompute, + def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(slice, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SliceCompute, + array_def) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(slice, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SliceCompute, + int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(slice, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SliceCompute, + array_int32) + 
.BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(slice, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SliceCompute, + int64) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(slice, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SliceCompute, + array_int64) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/loongarch/slice_compute.h b/lite/kernels/loongarch/slice_compute.h new file mode 100644 index 00000000000..1f2bd34ef12 --- /dev/null +++ b/lite/kernels/loongarch/slice_compute.h @@ -0,0 +1,417 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include "lite/backends/loongarch/fluid/eigen.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/relu_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +void DealTensorArray(const std::vector* XTensorList, + std::vector* OutTensorList, + lite::Tensor* Out, + const std::vector& starts, + const std::vector& ends, + bool out_is_array) { + auto in_array = XTensorList; + // If the input is LoDTensorArray, the rank of input is 1. + int64_t in_size = in_array->size(); + int64_t start = starts[0] < 0 ? 
(starts[0] + in_size) : starts[0]; + int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + CHECK_GT(end, start) << "end should greater than start"; + int64_t out_size = end - start; + + if (out_is_array) { + auto out_array = OutTensorList; + out_array->resize(out_size); + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->set_lod(in_tensor.lod()); + if (in_tensor.memory_size() > 0) { + out_tensor->CopyDataFrom(in_tensor); + } else { + VLOG(4) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out_tensor = Out; + auto in_tensor = in_array->at(start); + out_tensor->CopyDataFrom(in_tensor); + } +} + +inline std::vector GetIntDataFromTensorList( + const std::vector& list_tensor) { + std::vector vec_data; + for (auto& tensor_i : list_tensor) { + CHECK_EQ(tensor_i->dims(), DDim({1})) + << "shape of dim tensor should be [1]"; + auto precision = tensor_i->precision(); + switch (precision) { + case PRECISION(kInt32): { + vec_data.push_back(*tensor_i->data()); + break; + } + case PRECISION(kInt64): { + vec_data.push_back(static_cast(*tensor_i->data())); + break; + } + default: { + LOG(FATAL) << "unsupported data precision: " + << lite_api::PrecisionToStr(precision); + break; + } + } + } + return vec_data; +} + +inline std::vector GetIntDataFromTensor(const Tensor* tensor) { + std::vector vec_data; + auto precision = tensor->precision(); + switch (precision) { + case PRECISION(kInt32): { + const int* data = tensor->data(); + vec_data = std::vector(data, data + tensor->numel()); + break; + } + case PRECISION(kInt64): { + const int64_t* data = tensor->data(); + for (int64_t i = 0; i < tensor->numel(); i++) { + vec_data.push_back(static_cast(data[i])); + } + break; + } + default: { + LOG(FATAL) << "unsupported data precision: " + << lite_api::PrecisionToStr(precision); + break; + } + } + return vec_data; +} + +template +void slice_compute(const lite::Tensor* in, + lite::Tensor* out, + std::vector axes, + std::vector starts, + std::vector ends, + std::vector decrease_axis, + const lite::Tensor* StartsTensor, + const lite::Tensor* EndsTensor, + std::vector StartsTensorList, + std::vector EndsTensorList, + std::vector infer_flags) { + auto out_dims = out->dims(); + auto in_dims = in->dims(); + + bool need_infer = false; + if (StartsTensor || EndsTensor) { + need_infer = true; + } else if (StartsTensorList.size() > 0 || EndsTensorList.size() > 0) { + need_infer = true; + } + + if (need_infer) { + if (StartsTensor) { + starts = GetIntDataFromTensor(StartsTensor); + } else if (StartsTensorList.size() > 0) { + starts = GetIntDataFromTensorList(StartsTensorList); + } + CHECK_EQ(starts.size(), axes.size()) + << "The size of starts must be equal to the size of axes."; + if (EndsTensor) { + ends = GetIntDataFromTensor(EndsTensor); + } else if (EndsTensorList.size() > 0) { + ends = GetIntDataFromTensorList(EndsTensorList); + } + CHECK_EQ(ends.size(), axes.size()) + << "The size of ends must be equal to the size of axes."; + out_dims = in_dims; + int dim_value, start, end; + for (size_t i = 0; i < axes.size(); ++i) { + dim_value = out_dims[axes[i]]; + if (dim_value > 0) { + // when end = start + 1 and start == -1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + 
std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = 10000000; + } + } + + start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = (std::max)(start, 0); + end = (std::max)(end, 0); + end = (std::min)(end, dim_value); + CHECK_GT(end, start) << "end should greater than start"; + out_dims[axes[i]] = end - start; + } + } + out->Resize(out_dims); + // generate new shape + if (decrease_axis.size() > 0) { + std::vector new_out_shape; + for (size_t i = 0; i < decrease_axis.size(); ++i) { + CHECK_EQ(out_dims[decrease_axis[i]], 1) << "decrease dim should be 1"; + out_dims[decrease_axis[i]] = 0; + } + + for (size_t i = 0; i < out_dims.size(); ++i) { + if (out_dims[i] != 0) { + new_out_shape.push_back(out_dims[i]); + } + } + if (new_out_shape.size() == 0) { + new_out_shape.push_back(1); + } + + DDim new_dims; + new_dims.ConstructFrom(new_out_shape); + out_dims = new_dims; + } + } + + // resize out_dims + if (decrease_axis.size() > 0) { + if (decrease_axis.size() == static_cast(in_dims.size())) { + std::vector vec_origin_out_shape(decrease_axis.size(), 1); + // lite::DDim dims(vec_origin_out_shape); + out->Resize(vec_origin_out_shape); + } else { + std::vector vec_origin_out_shape( + out_dims.size() + decrease_axis.size(), -1); + for (size_t i = 0; i < decrease_axis.size(); ++i) { + vec_origin_out_shape[decrease_axis[i]] = 1; + } + int index = 0; + for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) { + if (-1 == vec_origin_out_shape[i]) { + vec_origin_out_shape[i] = out_dims[index]; + ++index; + } + } + // lite::DDim dims(vec_origin_out_shape); + out->Resize(vec_origin_out_shape); + } + } + + out->mutable_data(); + + auto new_out_dims = out->dims(); + auto offsets = Eigen::array(); + auto extents = Eigen::array(); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = new_out_dims[i]; + } + int start; + for (size_t i = 0; i < axes.size(); ++i) { + start = starts[i]; + if (start < 0) { + start = (start + in_dims[axes[i]]); + } + start = (std::max)(start, 0); + offsets[axes[i]] = start; + } + auto in_t = + lite::fluid::EigenTensor::From( + *in, in->dims()); + auto out_t = + lite::fluid::EigenTensor::From( + *out, new_out_dims); + out_t = in_t.slice(offsets, extents); + + out->Resize(out_dims); +} + +template +void slice_compute_(const lite::Tensor* Input, + lite::Tensor* Out, + const std::vector* XTensorList, + std::vector* OutTensorList, + std::vector axes, + std::vector starts, + std::vector ends, + std::vector decrease_axis, + const lite::Tensor* StartsTensor, + const lite::Tensor* EndsTensor, + std::vector StartsTensorList, + std::vector EndsTensorList, + std::vector infer_flags) { + if (Input == nullptr && XTensorList != nullptr) { + bool need_infer = false; + if (StartsTensor || EndsTensor) { + need_infer = true; + } + if (StartsTensorList.size() > 0 || EndsTensorList.size() > 0) { + need_infer = true; + } + if (need_infer) { + if (StartsTensor) { + starts = GetIntDataFromTensor(StartsTensor); + } else if (StartsTensorList.size() > 0) { + starts = GetIntDataFromTensorList(StartsTensorList); + } + CHECK_EQ(starts.size(), axes.size()) + << "The size of starts must be equal to the size of axes."; + if (EndsTensor) { + ends = GetIntDataFromTensor(EndsTensor); + } else if (EndsTensorList.size() > 0) { + ends = GetIntDataFromTensorList(EndsTensorList); + } + CHECK_EQ(ends.size(), axes.size()) + << "The size of starts must be equal to the size of axes."; + } + 
DealTensorArray(XTensorList, + OutTensorList, + Out, + starts, + ends, + (Out == nullptr && OutTensorList != nullptr)); + return; + } + int rank = Input->dims().size(); + switch (rank) { + case 1: + slice_compute(Input, + Out, + axes, + starts, + ends, + decrease_axis, + StartsTensor, + EndsTensor, + StartsTensorList, + EndsTensorList, + infer_flags); + break; + case 2: + slice_compute(Input, + Out, + axes, + starts, + ends, + decrease_axis, + StartsTensor, + EndsTensor, + StartsTensorList, + EndsTensorList, + infer_flags); + break; + case 3: + slice_compute(Input, + Out, + axes, + starts, + ends, + decrease_axis, + StartsTensor, + EndsTensor, + StartsTensorList, + EndsTensorList, + infer_flags); + break; + case 4: + slice_compute(Input, + Out, + axes, + starts, + ends, + decrease_axis, + StartsTensor, + EndsTensor, + StartsTensorList, + EndsTensorList, + infer_flags); + break; + case 5: + slice_compute(Input, + Out, + axes, + starts, + ends, + decrease_axis, + StartsTensor, + EndsTensor, + StartsTensorList, + EndsTensorList, + infer_flags); + break; + case 6: + slice_compute(Input, + Out, + axes, + starts, + ends, + decrease_axis, + StartsTensor, + EndsTensor, + StartsTensorList, + EndsTensorList, + infer_flags); + break; + } +} + +template +class SliceCompute : public KernelLite { + public: + using param_t = operators::SliceParam; + + void Run() override { + auto& param = *param_.get_mutable(); + slice_compute_(param.X, + param.Out, + param.XTensorList, + param.OutTensorList, + param.axes, + param.starts, + param.ends, + param.decrease_axis, + param.StartsTensor, + param.EndsTensor, + param.StartsTensorList, + param.EndsTensorList, + param.infer_flags); + } + + virtual ~SliceCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/softmax_compute.cc b/lite/kernels/loongarch/softmax_compute.cc new file mode 100644 index 00000000000..21699536bb5 --- /dev/null +++ b/lite/kernels/loongarch/softmax_compute.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/loongarch/softmax_compute.h" + +REGISTER_LITE_KERNEL(softmax, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SoftmaxCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::SoftmaxCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/softmax_compute.h b/lite/kernels/loongarch/softmax_compute.h new file mode 100644 index 00000000000..ea737ffc5e9 --- /dev/null +++ b/lite/kernels/loongarch/softmax_compute.h @@ -0,0 +1,95 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/loongarch/math/softmax.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +static inline int CanonicalAxis(const int axis, const int rank) { + if (axis < 0) { + return axis + rank; + } + return axis; +} + +static inline int SizeToAxis(const int axis, const DDim& dims) { + int size = 1; + for (int i = 0; i < axis; i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeFromAxis(const int axis, const DDim& dims) { + int size = 1; + for (size_t i = axis; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +template +class SoftmaxCompute : public KernelLite { + public: + using param_t = operators::SoftmaxParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(param.output); + CHECK(param.x); + + auto* x = param.x; + auto* output = param.output; + auto out_ptr = output->template mutable_data(); + + const int rank = x->dims().size(); + const int axis = CanonicalAxis(param.axis, rank); + int axis_dim = 0; + if (rank == 2 && axis == 1) { + axis_dim = x->dims()[axis]; + lite::loongarch::math::SoftmaxFunctor()( + context, axis_dim, x, output); + } else if (rank == 0) { + output->Resize(x->dims()); + out_ptr[0] = 1; + } else { + const int n = SizeToAxis(axis, x->dims()); + const int d = SizeFromAxis(axis, x->dims()); + DDim x_dims = x->dims(); + DDim out_dims = output->dims(); + DDim shape_2d(std::vector{n, d}); + x->Resize(shape_2d); + output->Resize(shape_2d); + axis_dim = x->dims()[axis]; + lite::loongarch::math::SoftmaxFunctor()( + context, axis_dim, x, output); + x->Resize(x_dims); + output->Resize(out_dims); + } + } + + virtual ~SoftmaxCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/softmax_compute_test.cc 
b/lite/kernels/loongarch/softmax_compute_test.cc new file mode 100644 index 00000000000..60004e26b22 --- /dev/null +++ b/lite/kernels/loongarch/softmax_compute_test.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/softmax_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +TEST(softmax_loongarch, retrive_op) { + auto softmax = KernelRegistry::Global().Create("softmax"); + ASSERT_FALSE(softmax.empty()); + ASSERT_TRUE(softmax.front()); +} + +TEST(softmax_loongarch, init) { + SoftmaxCompute softmax; + ASSERT_EQ(softmax.precision(), PRECISION(kFloat)); + ASSERT_EQ(softmax.target(), TARGET(kLoongArch)); +} + +TEST(softmax_loongarch, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 3, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + SoftmaxCompute softmax; + operators::SoftmaxParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + softmax.SetContext(std::move(ctx)); + + param.x = &x; + param.output = &out; + + softmax.SetParam(param); + softmax.Run(); + + std::vector ref_results = { + 0.0900306f, 0.244728f, 0.665241f, 0.0900306f, 0.244728f, 0.665241f, + 0.0900306f, 0.244728f, 0.665241f, 0.0900306f, 0.244728f, 0.665241f, + 0.0900306f, 0.244728f, 0.665241f, 0.0900306f, 0.244728f, 0.665241f, + 0.0900306f, 0.244728f, 0.665241f, 0.0900306f, 0.244728f, 0.665241f, + 0.0900306f, 0.244728f, 0.665241f}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(softmax, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/stack_compute.cc b/lite/kernels/loongarch/stack_compute.cc new file mode 100644 index 00000000000..cf358754c9c --- /dev/null +++ b/lite/kernels/loongarch/stack_compute.cc @@ -0,0 +1,24 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
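+//
+// The LoongArch build reuses the target-independent host StackCompute kernel;
+// this file only registers it for the kLoongArch target so the "stack" op can
+// run when its inputs and outputs live on LoongArch tensors.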
+ +#include "lite/kernels/host/stack_compute.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +using stack_float = + paddle::lite::kernels::host::StackCompute; +REGISTER_LITE_KERNEL(stack, kLoongArch, kFloat, kNCHW, stack_float, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/transpose_compute.cc b/lite/kernels/loongarch/transpose_compute.cc new file mode 100644 index 00000000000..feaae61eed0 --- /dev/null +++ b/lite/kernels/loongarch/transpose_compute.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/transpose_compute.h" + +REGISTER_LITE_KERNEL(transpose, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::TransposeCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::Transpose2Compute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::TransposeCompute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::Transpose2Compute, + int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt32))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::TransposeCompute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::Transpose2Compute, + int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kLoongArch), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/loongarch/transpose_compute.h b/lite/kernels/loongarch/transpose_compute.h new file mode 100644 index 00000000000..150d9222bc0 --- /dev/null +++ 
b/lite/kernels/loongarch/transpose_compute.h @@ -0,0 +1,118 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/backends/loongarch/math/math_function.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/transpose_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +inline void TransCompute(const int dim, + const lite::Context& context, + const lite::Tensor& in, + lite::Tensor* out, + const std::vector& axis) { + switch (dim) { + case 1: + paddle::lite::loongarch::math::Transpose trans1; + trans1(context, in, out, axis); + break; + case 2: + paddle::lite::loongarch::math::Transpose trans2; + trans2(context, in, out, axis); + break; + case 3: + paddle::lite::loongarch::math::Transpose trans3; + trans3(context, in, out, axis); + break; + case 4: + paddle::lite::loongarch::math::Transpose trans4; + trans4(context, in, out, axis); + break; + case 5: + paddle::lite::loongarch::math::Transpose trans5; + trans5(context, in, out, axis); + break; + case 6: + paddle::lite::loongarch::math::Transpose trans6; + trans6(context, in, out, axis); + break; + default: + LOG(FATAL) << "Tensors with rank at most 6 are supported"; + } +} + +template +class TransposeCompute : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* x = param.x; + auto* out = param.output; + auto* x_ptr = x->template data(); + auto* out_ptr = out->template mutable_data(); + int ndims = param.axis.size(); + auto& context = ctx_->As(); + if (!param.x->dims().size()) { + out_ptr[0] = x_ptr[0]; + return; + } + TransCompute( + ndims, context, *x, out, param.axis); + } + + virtual ~TransposeCompute() = default; +}; + +template +class Transpose2Compute : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* x = param.x; + auto* out = param.output; + auto* x_ptr = x->template data(); + auto* out_ptr = out->template mutable_data(); + int ndims = param.axis.size(); + auto& context = ctx_->As(); + if (!param.x->dims().size()) { + out_ptr[0] = x_ptr[0]; + return; + } + TransCompute( + ndims, context, *x, out, param.axis); + } + + virtual ~Transpose2Compute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/transpose_compute_test.cc b/lite/kernels/loongarch/transpose_compute_test.cc new file mode 100644 index 00000000000..cbf8eddbfe8 --- /dev/null +++ b/lite/kernels/loongarch/transpose_compute_test.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/loongarch/transpose_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +// transpose +TEST(transpose_loongarch, retrive_op) { + auto transpose = KernelRegistry::Global().Create("transpose"); + ASSERT_FALSE(transpose.empty()); + ASSERT_TRUE(transpose.front()); +} + +TEST(transpose_loongarch, init) { + lite::kernels::loongarch::TransposeCompute transpose; + ASSERT_EQ(transpose.precision(), PRECISION(kFloat)); + ASSERT_EQ(transpose.target(), TARGET(kLoongArch)); +} + +TEST(transpose_loongarch, run_test) { + lite::Tensor x; + lite::Tensor out; + std::vector x_shape({3, 4, 5}); + x.Resize(lite::DDim(x_shape)); + std::vector out_shape({3, 5, 4}); + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + + // TransposeCompute transpose; + TransposeCompute transpose; + operators::TransposeParam param; + + param.x = &x; + param.output = &out; + std::vector axis({0, 2, 1}); + param.axis = axis; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + transpose.SetContext(std::move(ctx)); + transpose.SetParam(param); + transpose.Run(); + + for (int j = 0; j < out.dims().production(); ++j) { + // EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + LOG(INFO) << out_data[j]; + } +} + +// transpose2 +TEST(transpose2_loongarch, retrive_op) { + auto transpose2 = KernelRegistry::Global().Create("transpose2"); + ASSERT_FALSE(transpose2.empty()); + ASSERT_TRUE(transpose2.front()); +} + +TEST(transpose2_loongarch, init) { + lite::kernels::loongarch::Transpose2Compute transpose2; + ASSERT_EQ(transpose2.precision(), PRECISION(kFloat)); + ASSERT_EQ(transpose2.target(), TARGET(kLoongArch)); +} + +TEST(transpose2_loongarch, run_test) { + lite::Tensor x; + lite::Tensor xshape; + lite::Tensor out; + std::vector x_shape({3, 4, 5}); + x.Resize(lite::DDim(x_shape)); + std::vector out_shape({3, 5, 4}); + out.Resize(lite::DDim(out_shape)); + std::vector xshape_shape({3, 4, 5}); + xshape.Resize(lite::DDim(xshape_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + auto xshape_data = xshape.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + xshape_data[i] = static_cast(i); + } + + // Transpose2Compute transpose2; + Transpose2Compute transpose2; + operators::TransposeParam param; + + param.x = &x; + param.output = &out; + param.xshape = &xshape; + std::vector axis({0, 2, 1}); + param.axis = axis; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + transpose2.SetContext(std::move(ctx)); + transpose2.SetParam(param); + transpose2.Run(); + + for (int j = 0; j < out.dims().production(); ++j) { + LOG(INFO) << out_data[j]; + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace 
lite +} // namespace paddle + +USE_LITE_KERNEL(transpose, kLoongArch, kFloat, kNCHW, def); +USE_LITE_KERNEL(transpose2, kLoongArch, kFloat, kNCHW, def); diff --git a/lite/kernels/loongarch/var_conv_2d_compute.cc b/lite/kernels/loongarch/var_conv_2d_compute.cc new file mode 100644 index 00000000000..112cc83da88 --- /dev/null +++ b/lite/kernels/loongarch/var_conv_2d_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/loongarch/var_conv_2d_compute.h" + +REGISTER_LITE_KERNEL(var_conv_2d, + kLoongArch, + kFloat, + kNCHW, + paddle::lite::kernels::loongarch::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kLoongArch))}) + .Finalize(); diff --git a/lite/kernels/loongarch/var_conv_2d_compute.h b/lite/kernels/loongarch/var_conv_2d_compute.h new file mode 100644 index 00000000000..c3bbd8ece06 --- /dev/null +++ b/lite/kernels/loongarch/var_conv_2d_compute.h @@ -0,0 +1,214 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/loongarch/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +template +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Im2Col(const lite::Tensor& input, lite::Tensor* col) const { + auto& param = *param_.get_mutable(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
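+    // X is expected to carry a three-level LoD: level 0 gives the per-sample
+    // offsets into the flattened data, level 1 the row (height) offsets and
+    // level 2 the column (width) offsets, so each sample's variable 2-D extent
+    // can be recovered without the separate ROW/COLUMN inputs.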
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + CHECK_EQ(param.X->lod().size(), 3u) << "input lod size should be 3!"; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + // std::vector col_lod_vec; + // col_lod_vec.push_back(top_offset); + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->template mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } + } + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + Im2Col(*bottom, col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / 
stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->template mutable_data(); + const auto* w_data = w->template data(); + const auto* col_data = col->template data(); + + auto blas = lite::loongarch::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } + } + + virtual ~VarConv2DCompute() = default; +}; + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/loongarch/var_conv_2d_compute_test.cc b/lite/kernels/loongarch/var_conv_2d_compute_test.cc new file mode 100644 index 00000000000..b98efb9d223 --- /dev/null +++ b/lite/kernels/loongarch/var_conv_2d_compute_test.cc @@ -0,0 +1,316 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/loongarch/var_conv_2d_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace loongarch { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
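+  // Unlike the kernel's Im2Col, this reference implementation reads each
+  // sample's height and width from the LoD of the separate ROW and COLUMN
+  // tensors instead of the higher LoD levels of X.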
+ const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); 
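+  // The reference output is laid out as a single {top_size, 1} column, matching
+  // the kernel; each sample is then produced by one GEMM of the weight matrix
+  // against its im2col block.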
+ std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::loongarch::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_loongarch, retrive_op) { + auto var_conv_2d = KernelRegistry::Global().Create("var_conv_2d"); + ASSERT_FALSE(var_conv_2d.empty()); + ASSERT_TRUE(var_conv_2d.front()); +} + +TEST(var_conv_2d_loongarch, init) { + VarConv2DCompute var_conv_2d; + ASSERT_EQ(var_conv_2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(var_conv_2d.target(), TARGET(kLoongArch)); +} + +TEST(var_conv_2d_loongarch, run_test) { + VarConv2DCompute var_conv_2d; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor Out, Col; + int kernel_h, kernel_w; + int stride_h, stride_w; + int input_channel, output_channel; + + output_channel = 5; + input_channel = 5; + kernel_h = 5; + kernel_w = 5; + stride_h = 1; + stride_w = 1; + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + auto* w_data = W.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + x_lod_vec.push_back(height * width * input_channel); + x_size += height * width * input_channel; + } + std::vector x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + X.set_lod(x_lod); + auto* x_data = X.mutable_data(); + for (int i = 0; i < X.numel(); ++i) { + x_data[i] = i % 20 * 1.f; + } + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_2d.SetParam(param); + var_conv_2d.SetContext(std::move(ctx)); + var_conv_2d.Run(); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&X, + &W, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(Out.data()[i], top_ref.data()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(Col.data()[i], col_ref.data()[i], 1e-5); + } +} + +} // namespace loongarch +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(var_conv_2d, 
kLoongArch, kFloat, kNCHW, def);
diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt
index 318d046c65e..566db70dc8d 100755
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -278,7 +278,7 @@ endif()
 add_library(ops STATIC ${OPS_SRC} op_params.cc)
 add_dependencies(ops utils)
 
-if (NOT LITE_WITH_X86)
+if (NOT LITE_WITH_X86 AND NOT LITE_WITH_LOONGARCH)
   if(NOT LITE_WITH_ARM)
     lite_cc_test(test_one_hot_op SRCS one_hot_op_test.cc)
   endif()
diff --git a/lite/operators/tile_op.cc b/lite/operators/tile_op.cc
index c034d56447a..5de95a5687a 100644
--- a/lite/operators/tile_op.cc
+++ b/lite/operators/tile_op.cc
@@ -89,8 +89,8 @@ bool TileOp::InferShapeImpl() const {
     } else {
       CHECK_GT(repeat_times[i], 0)
           << "Every element of the input 'repeat_times' for tile op must be "
-          << "greater than 1, but the value given is ",
-          repeat_times[i];
+          << "greater than 1, but the value given is "
+          << repeat_times[i];
       out_shape[i] = x_dim_vec[i] * repeat_times[i];
     }
   }
diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh
index ace7a8b214e..86a72b274bf 100755
--- a/lite/tools/build_linux.sh
+++ b/lite/tools/build_linux.sh
@@ -27,6 +27,8 @@ OPTMODEL_DIR=""
 # options of compiling x86 lib
 WITH_STATIC_MKL=OFF
 WITH_AVX=ON
+# options of compiling LoongArch
+WITH_LASX=ON
 # options of compiling OPENCL lib.
 WITH_OPENCL=OFF
 # options of compiling Metal lib for Mac OS.
@@ -150,6 +152,11 @@ function init_cmake_mutable_options {
         with_x86=ON
         arm_target_os=""
        WITH_TINY_PUBLISH=OFF
+    elif [ "${ARCH}" == "loongarch" ]; then
+        with_loongarch=ON
+        arm_target_os=""
+        WITH_TINY_PUBLISH=OFF
+        WITH_AVX=OFF
     else
         with_arm=ON
         arm_arch=$ARCH
@@ -183,6 +190,7 @@ function init_cmake_mutable_options {
 
     cmake_mutable_options="-DLITE_WITH_ARM=$with_arm \
                         -DLITE_WITH_X86=$with_x86 \
+                        -DLITE_WITH_LOONGARCH=$with_loongarch \
                         -DARM_TARGET_ARCH_ABI=$arm_arch \
                         -DARM_TARGET_OS=$arm_target_os \
                         -DARM_TARGET_LANG=$TOOLCHAIN \
@@ -197,6 +205,7 @@ function init_cmake_mutable_options {
                         -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
                         -DWITH_STATIC_MKL=$WITH_STATIC_MKL \
                         -DWITH_AVX=$WITH_AVX \
+                        -DWITH_LASX=$WITH_LASX \
                         -DLITE_WITH_OPENCL=$WITH_OPENCL \
                         -DLITE_WITH_METAL=$WITH_METAL \
                         -DLITE_WITH_RKNPU=$WITH_ROCKCHIP_NPU \
@@ -429,6 +438,10 @@ function print_usage {
     echo -e "| --with_static_mkl: (OFF|ON); controls whether to compile static mkl lib, default is OFF |"
     echo -e "| --with_avx: (OFF|ON); controls whether to use avx , default is ON |"
     echo -e "| |"
+    echo -e "| arguments of loongarch compiling: |"
+    echo -e "| ./lite/tools/build_linux.sh --arch=loongarch |"
+    echo -e "| --with_lasx: (OFF|ON); controls whether to use lasx , default is ON |"
+    echo -e "| |"
     echo -e "| arguments of opencl library compiling: |"
     echo -e "| ./lite/tools/build_linux.sh --with_opencl=ON |"
     echo -e "| --with_opencl: (OFF|ON); controls whether to compile lib for opencl, default is OFF |"
@@ -543,6 +556,10 @@ function main {
             WITH_AVX="${i#*=}"
             shift
             ;;
+        --with_lasx=*)
+            WITH_LASX="${i#*=}"
+            shift
+            ;;
         --skip_support_0_dim_tensor_pass=*)
             SKIP_SUPPORT_0_DIM_TENSOR_PASS="${i#*=}"
             shift