Skip to content

Commit 6160cf3

Browse files
author
panhehe
committed
[runtime/xpu] 🐻Support the execution of non-streaming parsing on the Kunlun XPU card #1455
1 parent 89e8d0d commit 6160cf3

28 files changed

+3463
-6
lines changed

runtime/core/cmake/xpu.cmake

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Define ANSI escape sequences used to colour console messages.
# They are only set on non-Windows hosts; on WIN32 the variables are left
# unset here, so (unless defined elsewhere) they expand to empty strings.
if(NOT WIN32)
2+
# Esc receives the ASCII character with code 27, which prefixes every sequence.
string(ASCII 27 Esc)
3+
set(ColourReset "${Esc}[m")
4+
set(ColourBold "${Esc}[1m")
5+
set(Red "${Esc}[31m")
6+
set(Green "${Esc}[32m")
7+
set(Yellow "${Esc}[33m")
8+
set(Blue "${Esc}[34m")
9+
set(Magenta "${Esc}[35m")
10+
set(Cyan "${Esc}[36m")
11+
set(White "${Esc}[37m")
12+
set(BoldRed "${Esc}[1;31m")
13+
set(BoldGreen "${Esc}[1;32m")
14+
set(BoldYellow "${Esc}[1;33m")
15+
set(BoldBlue "${Esc}[1;34m")
16+
set(BoldMagenta "${Esc}[1;35m")
17+
set(BoldCyan "${Esc}[1;36m")
18+
set(BoldWhite "${Esc}[1;37m")
19+
endif()
20+
21+
# Kunlun XPU build configuration; everything below applies only when the XPU
# variable evaluates to true.
if(XPU)
22+
# include() does not change CMAKE_CURRENT_SOURCE_DIR, so this is the source
# directory of the CMakeLists.txt that includes this module.
set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR})
23+
message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n")
24+
set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu)
25+
# The XPU API location must be supplied through the XPU_API_PATH environment
# variable; configuration stops with a fatal error when it is not defined.
if(NOT DEFINED ENV{XPU_API_PATH})
26+
message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n")
27+
else()
28+
set(XPU_API_PATH $ENV{XPU_API_PATH})
29+
# No mode keyword is given, so this is printed as a NOTICE-level message
# rather than a STATUS message.
message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.")
30+
endif()
31+
32+
# Directory-scoped header search paths: the Kunlun runtime directory, its xpu/
# subdirectory, and the include directories under and next to XPU_API_PATH.
include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/
33+
${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include)
34+
# Directory-scoped library search paths for the shared objects under and next
# to XPU_API_PATH.
link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/)
35+
36+
# Adds -DUSE_XPU to the compile flags, defining the USE_XPU preprocessor macro.
add_definitions(-DUSE_XPU)
37+
endif()

runtime/core/decoder/CMakeLists.txt

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ set(decoder_srcs
77
ctc_endpoint.cc
88
)
99

10-
if(NOT TORCH AND NOT ONNX)
11-
message(FATAL_ERROR "Please build with TORCH or ONNX!!!")
10+
if(NOT TORCH AND NOT ONNX AND NOT XPU)
11+
message(FATAL_ERROR "Please build with TORCH or ONNX or XPU!!!")
1212
endif()
1313
if(TORCH)
1414
list(APPEND decoder_srcs torch_asr_model.cc)
@@ -18,7 +18,8 @@ if(ONNX)
1818
endif()
1919

2020
add_library(decoder STATIC ${decoder_srcs})
21-
target_link_libraries(decoder PUBLIC kaldi-decoder frontend post_processor utils)
21+
target_link_libraries(decoder PUBLIC kaldi-decoder frontend
22+
post_processor utils)
2223

2324
if(ANDROID)
2425
target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY})
@@ -29,4 +30,7 @@ else()
2930
if(ONNX)
3031
target_link_libraries(decoder PUBLIC onnxruntime)
3132
endif()
33+
if(XPU)
34+
target_link_libraries(decoder PUBLIC xpu_conformer)
35+
endif()
3236
endif()

runtime/core/decoder/params.h

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515

16-
1716
#ifndef DECODER_PARAMS_H_
1817
#define DECODER_PARAMS_H_
1918

@@ -29,17 +28,24 @@
2928
#ifdef USE_TORCH
3029
#include "decoder/torch_asr_model.h"
3130
#endif
31+
#ifdef USE_XPU
32+
#include "xpu/xpu_asr_model.h"
33+
#endif
3234
#include "frontend/feature_pipeline.h"
3335
#include "post_processor/post_processor.h"
3436
#include "utils/flags.h"
3537
#include "utils/string.h"
3638

3739
DEFINE_int32(num_threads, 1, "num threads for ASR model");
40+
DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model");
3841

3942
// TorchAsrModel flags
4043
DEFINE_string(model_path, "", "pytorch exported model path");
4144
// OnnxAsrModel flags
4245
DEFINE_string(onnx_dir, "", "directory where the onnx model is saved");
46+
// XPUAsrModel flags
47+
DEFINE_string(xpu_model_dir, "",
48+
"directory where the XPU model and weights is saved");
4349

4450
// FeaturePipelineConfig flags
4551
DEFINE_int32(num_bins, 80, "num mel bins for fbank feature");
@@ -66,7 +72,8 @@ DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search");
6672
DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search");
6773
DEFINE_double(blank_skip_thresh, 1.0,
6874
"blank skip thresh for ctc wfst search, 1.0 means no skip");
69-
DEFINE_double(length_penalty, 0.0, "length penalty ctc wfst search, will not"
75+
DEFINE_double(length_penalty, 0.0,
76+
"length penalty ctc wfst search, will not"
7077
"apply on self-loop arc, for balancing the del/ins ratio, "
7178
"suggest set to -3.0");
7279
DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search");
@@ -130,7 +137,7 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
130137
#else
131138
LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'.";
132139
#endif
133-
} else {
140+
} else if (!FLAGS_model_path.empty()) {
134141
#ifdef USE_TORCH
135142
LOG(INFO) << "Reading torch model " << FLAGS_model_path;
136143
TorchAsrModel::InitEngineThreads(FLAGS_num_threads);
@@ -140,6 +147,19 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
140147
#else
141148
LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'.";
142149
#endif
150+
} else if (!FLAGS_xpu_model_dir.empty()) {
151+
#ifdef USE_XPU
152+
LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir;
153+
auto model = std::make_shared<XPUAsrModel>();
154+
model->SetEngineThreads(FLAGS_num_threads);
155+
model->SetDeviceId(FLAGS_device_id);
156+
model->Read(FLAGS_xpu_model_dir);
157+
resource->model = model;
158+
#else
159+
LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'.";
160+
#endif
161+
} else {
162+
LOG(FATAL) << "Please set ONNX, TORCH or XPU model path!!!";
143163
}
144164

145165
LOG(INFO) << "Reading unit table " << FLAGS_unit_path;

runtime/kunlun/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
build/
2+
fc_base/

runtime/kunlun/CMakeLists.txt

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Top-level build script for the Kunlun XPU runtime: project declaration,
# build options, FetchContent configuration and global compiler flags.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
2+
3+
project(wenet VERSION 0.1)
4+
5+
# User-configurable build options (all boolean).
option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF)
6+
option(GRAPH_TOOLS "whether to build TLG graph tools" OFF)
7+
option(BUILD_TESTING "whether to build unit test" OFF)
8+
9+
option(GRPC "whether to build with gRPC" OFF)
10+
# TODO(Binbin Zhang): Change websocket to OFF since it depends on boost
11+
# which is a very big library
12+
option(WEBSOCKET "whether to build with websocket" OFF)
13+
# XPU is the only option in this file that defaults to ON.
option(XPU "whether to build with XPU" ON)
14+
15+
set(CMAKE_VERBOSE_MAKEFILE OFF)
16+
17+
# FetchContent setup: FETCHCONTENT_QUIET is turned off, and the base directory
# is set to the resolved real path of "fc_base" under this source directory.
include(FetchContent)
18+
set(FETCHCONTENT_QUIET OFF)
19+
get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
20+
set(FETCHCONTENT_BASE_DIR ${fc_base})
21+
22+
# Let include(<name>) find the modules stored in the cmake/ subdirectory.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
23+
24+
# Append C++14, pthread and position-independent-code flags to the global
# C++ compiler flags.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC")
25+
26+
# Include all dependencies
27+
include(openfst)
28+
# This CMakeLists.txt is only used for Kunlun XPU builds, so the sections
29+
# about ONNX, libtorch, GPU and Windows have been removed.
30+
include(xpu)
31+
# Compile xpu_conformer.a and conformer_test
32+
add_subdirectory(xpu)
33+
34+
# Directory-scoped header search paths: this source directory and its kaldi/
# subdirectory.
include_directories(
35+
${CMAKE_CURRENT_SOURCE_DIR}
36+
${CMAKE_CURRENT_SOURCE_DIR}/kaldi
37+
)
38+
39+
# Build all libraries
40+
add_subdirectory(utils)
41+
# Build-order dependency only: make sure the openfst target is built before
# utils (add_dependencies does not link anything).
if(NOT MSVC)
42+
add_dependencies(utils openfst)
43+
endif()
44+
add_subdirectory(frontend)
45+
add_subdirectory(post_processor)
46+
add_subdirectory(kaldi) # kaldi: wfst based decoder
47+
add_subdirectory(decoder)
48+
add_subdirectory(api)
49+
50+
# Optionally build the websocket component (pulls in the boost module)
51+
if(WEBSOCKET)
52+
include(boost)
53+
add_subdirectory(websocket)
54+
endif()
55+
56+
# Optionally build the gRPC component (pulls in the grpc module)
57+
if(GRPC)
58+
include(grpc)
59+
add_subdirectory(grpc)
60+
endif()
61+
62+
# Build all binaries
63+
add_subdirectory(bin)
64+
65+
# Unit tests, enabled by the BUILD_TESTING option (pulls in the gtest module)
66+
if(BUILD_TESTING)
67+
include(gtest)
68+
add_subdirectory(test)
69+
endif()

runtime/kunlun/README.md

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# 在昆仑芯片上运行Wenet
2+
## 介绍
3+
下面的示例展示了如何在XPU上部署WeNet离线或在线的ASR模型。XPU是一种由昆仑芯100%自主研发的通用人工智能计算核心架构。
4+
5+
## 准备XPU运行环境
6+
7+
在开始之前,请确认您获得以下必须的环境。
8+
9+
XRE(XPU Runtime Environment):昆仑芯片的基础运行环境,包括芯片驱动程序、runtime api库、固件FW工具等功能模块。
10+
XDNN(XPU Deep Neural Network Library):加速深度神经网络的昆仑芯片库,提供应用程序中使用的高性能DNN功能库。
11+
12+
如果您需要任何帮助,或是想要进一步了解昆仑芯片,请通过官方网址联系我们:
13+
https://www.kunlunxin.com.cn/
14+
15+
## 操作步骤
16+
- 第一步:构建,需要cmake 3.14及以上版本
17+
18+
``` sh
19+
export CXX=${your_g++_path}
20+
export CC=${your_gcc_path}
21+
export XPU_API_PATH=${your_api_path}
22+
23+
# -r : release version; -d : debug version
24+
bash ./compile.sh -r
25+
```
26+
27+
- 第二步:测试,测试结果将在控制台输出
28+
29+
``` sh
30+
## set KUNLUN XPU visible device
31+
export XPU_VISIBLE_DEVICES=0
32+
export XPUSIM_DEVICE_MODEL=KUNLUN2
33+
## set logging level
34+
export GLOG_logtostderr=1
35+
export GLOG_v=3
36+
## set speech wav and model/weight path
37+
wav_path=${your_test_wav_path}
38+
xpu_model_dir=${your_xpu_weight_dir}
39+
units=${your_units.txt}
40+
## run the command
41+
./build/bin/decoder_main \
42+
--chunk_size -1 \
43+
--wav_path ${wav_path} \
44+
--xpu_model_dir ${xpu_model_dir} \
45+
--unit_path ${units} \
46+
--device_id 0 \
47+
--nbest 3 2>&1 | tee log.txt
48+
```
49+
50+
单条语音执行结果如下所示:
51+
52+
``` sh
53+
XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
54+
I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
55+
I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
56+
I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
57+
I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4
58+
I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6
59+
I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538
60+
I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538
61+
I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1
62+
I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
63+
I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
64+
I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
65+
I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
66+
I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418
67+
I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103
68+
I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512
69+
I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538
70+
I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
71+
I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
72+
I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
73+
I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3
74+
I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3
75+
I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3
76+
I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14
77+
I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
78+
I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
79+
I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
80+
I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
81+
test 甚至出现交易几乎停滞的情况
82+
I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
83+
```

runtime/kunlun/README_EN.md

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# WeNet running on KUNLUNXIN XPU device
2+
## Introduction
3+
The example below shows how to deploy WeNet offline and online ASR models on XPUs.
4+
XPU is a core architecture 100% independently developed by KUNLUNXIN for general artificial intelligence computing.
5+
6+
## Setup environment for XPU device
7+
8+
Before you start, make sure you have the following required environments:
9+
10+
XRE (XPU Runtime Environment): The basic operating environment of the XPUs
11+
includes functional modules such as chip drivers, runtime api library, and firmware tools.
12+
13+
XDNN(XPU Deep Neural Network Library): XPU library for accelerating deep neural networks, providing high-performance DNN function library used in applications.
14+
15+
If you would like to know more about XPUs or need any help, please contact us through the official website:
16+
17+
https://www.kunlunxin.com.cn/
18+
19+
## Instruction
20+
- Step 1. Build, the build requires cmake 3.14 or above.
21+
22+
``` sh
23+
export CXX=${your_g++_path}
24+
export CC=${your_gcc_path}
25+
export XPU_API_PATH=${your_api_path}
26+
27+
# -r : release version; -d : debug version
28+
bash ./compile.sh -r
29+
```
30+
31+
- Step 2. Testing, the result is shown in the console.
32+
33+
``` sh
34+
## set KUNLUN XPU visible device
35+
export XPU_VISIBLE_DEVICES=0
36+
export XPUSIM_DEVICE_MODEL=KUNLUN2
37+
## set logging level
38+
export GLOG_logtostderr=1
39+
export GLOG_v=3
40+
## set speech wav and model/weight/units path
41+
wav_path=${your_test_wav_path}
42+
xpu_model_dir=${your_xpu_weight_dir}
43+
units=${your_units.txt}
44+
## run the command
45+
./build/bin/decoder_main \
46+
--chunk_size -1 \
47+
--wav_path $wav_path \
48+
--xpu_model_dir $xpu_model_dir \
49+
--unit_path $units \
50+
--device_id 0 \
51+
--nbest 3 2>&1 | tee log.txt
52+
```
53+
54+
A typical output is as follows:
55+
56+
``` sh
57+
XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
58+
I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
59+
I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
60+
I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
61+
I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4
62+
I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6
63+
I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538
64+
I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538
65+
I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1
66+
I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
67+
I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
68+
I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
69+
I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
70+
I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418
71+
I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103
72+
I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512
73+
I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538
74+
I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
75+
I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
76+
I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
77+
I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3
78+
I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3
79+
I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3
80+
I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14
81+
I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
82+
I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
83+
I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
84+
I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
85+
test 甚至出现交易几乎停滞的情况
86+
I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
87+
```

runtime/kunlun/api

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../core/api

runtime/kunlun/bin

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../core/bin

runtime/kunlun/cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../core/cmake

0 commit comments

Comments
 (0)