
Commit 7c3edcf

Text Detection model DB (#175)
* Text Detection model DB
* review 1
1 parent 4913c64 commit 7c3edcf

File tree

3 files changed: +226 -0 lines changed

models/text_detection_db/CMakeLists.txt
models/text_detection_db/README.md
models/text_detection_db/demo.cpp
models/text_detection_db/CMakeLists.txt

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
cmake_minimum_required(VERSION 3.24)
set(project_name "opencv_zoo_text_detection_db")

PROJECT (${project_name})

set(OPENCV_VERSION "4.7.0")
set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
# Find OpenCV, you may need to set OpenCV_DIR variable
# to the absolute path to the directory containing OpenCVConfig.cmake file
# via the command line or GUI

file(GLOB SourceFile
     "demo.cpp")
# If the package has been found, several variables will
# be set, you can find the full list with descriptions
# in the OpenCVConfig.cmake file.
# Print some message showing some of them
message(STATUS "OpenCV library status:")
message(STATUS "    config: ${OpenCV_DIR}")
message(STATUS "    version: ${OpenCV_VERSION}")
message(STATUS "    libraries: ${OpenCV_LIBS}")
message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

# Declare the executable target built from your sources
add_executable(${project_name} ${SourceFile})

# Link your application with OpenCV libraries
target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
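For context, OPENCV_INSTALLATION_PATH above is only a hint handed to find_package(), so pointing the build at a non-default OpenCV install is just a matter of passing that cache variable at configure time. A minimal sketch, assuming a placeholder install prefix of your own:

```shell
# Sketch only: /path/to/opencv/install is a placeholder for your own prefix.
# The value is forwarded to find_package(OpenCV ... HINTS ...) by the CMakeLists above.
cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/install .
cmake --build build
```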

models/text_detection_db/README.md

Lines changed: 18 additions & 0 deletions
@@ -11,6 +11,8 @@ Note:

## Demo

### Python

Run the following command to try the demo:

```shell
@@ -23,6 +25,22 @@ python demo.py --input /path/to/image -v
python demo.py --help
```

### C++

Install the latest OpenCV and CMake >= 3.24.0 to get started with:

```shell
# A typical and default installation path of OpenCV is /usr/local
cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
cmake --build build
# detect on camera input
./build/opencv_zoo_text_detection_db -m=/path/to/model
# detect on an image
./build/opencv_zoo_text_detection_db -m=/path/to/model -i=/path/to/image -v
# get help messages
./build/opencv_zoo_text_detection_db -h
```

### Example outputs

![mask](./example_outputs/mask.jpg)

models/text_detection_db/demo.cpp

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
#include <iostream>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

using namespace std;
using namespace cv;
using namespace dnn;

// Valid (backend, target) combinations, indexed by the --backend option.
vector< pair<cv::dnn::Backend, cv::dnn::Target> > backendTargetPairs = {
    std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU),
    std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA),
    std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA_FP16),
    std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_TIMVX, dnn::DNN_TARGET_NPU),
    std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CANN, dnn::DNN_TARGET_NPU)};

std::string keys =
    "{ help h            |                                               | Print help message. }"
    "{ model m           | text_detection_DB_IC15_resnet18_2021sep.onnx | Usage: Set model type, defaults to text_detection_DB_IC15_resnet18_2021sep.onnx }"
    "{ input i           |                                               | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ width             | 736                                           | Usage: Resize input image to certain width, default = 736. It should be a multiple of 32.}"
    "{ height            | 736                                           | Usage: Resize input image to certain height, default = 736. It should be a multiple of 32.}"
    "{ binary_threshold  | 0.3                                           | Usage: Threshold of the binary map, default = 0.3.}"
    "{ polygon_threshold | 0.5                                           | Usage: Threshold of polygons, default = 0.5.}"
    "{ max_candidates    | 200                                           | Usage: Set maximum number of polygon candidates, default = 200.}"
    "{ unclip_ratio      | 2.0                                           | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
    "{ save s            | true                                          | Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.}"
    "{ viz v             | true                                          | Usage: Specify to open a new window to show results. Invalid in case of camera input.}"
    "{ backend bt        | 0                                             | Choose one of computation backends: "
    "0: (default) OpenCV implementation + CPU, "
    "1: CUDA + GPU (CUDA), "
    "2: CUDA + GPU (CUDA FP16), "
    "3: TIM-VX + NPU, "
    "4: CANN + NPU}";

class DB {
public:
    DB(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3,
       float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0,
       dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU)
        : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
          polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
          backendId(bId), targetId(tId)
    {
        this->model = TextDetectionModel_DB(readNet(modelPath));
        this->model.setPreferableBackend(backendId);
        this->model.setPreferableTarget(targetId);

        this->model.setBinaryThreshold(binaryThreshold);
        this->model.setPolygonThreshold(polygonThreshold);
        this->model.setUnclipRatio(unclipRatio);
        this->model.setMaxCandidates(maxCandidates);

        // Preprocessing: scale pixels to [0, 1] and subtract the per-channel mean.
        this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793));
    }

    // Runs detection on an image that already matches the network input size and
    // returns the detected text polygons together with their confidences.
    pair< vector<vector<Point>>, vector<float> > infer(Mat image) {
        CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size");
        CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size");
        vector<vector<Point>> pt;
        vector<float> confidence;
        this->model.detect(image, pt, confidence);
        return make_pair(pt, confidence);
    }

private:
    string modelPath;
    TextDetectionModel_DB model;
    Size inputSize;
    float binaryThreshold;
    float polygonThreshold;
    int maxCandidates;
    double unclipRatio;
    dnn::Backend backendId;
    dnn::Target targetId;
};

Mat visualize(Mat image, pair< vector<vector<Point>>, vector<float> >& results, double fps = -1, Scalar boxColor = Scalar(0, 255, 0), Scalar textColor = Scalar(0, 0, 255), bool isClosed = true, int thickness = 2)
{
    Mat output;
    image.copyTo(output);
    if (fps > 0)
        putText(output, format("FPS: %.2f", fps), Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
    polylines(output, results.first, isClosed, boxColor, thickness);
    return output;
}

int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);

    parser.about("Use this program to run Real-time Scene Text Detection with Differentiable Binarization from the OpenCV Zoo using OpenCV.");
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    int backendTargetid = parser.get<int>("backend");
    String modelName = parser.get<String>("model");

    if (modelName.empty())
    {
        CV_Error(Error::StsError, "Model file " + modelName + " not found");
    }

    Size inpSize(parser.get<int>("width"), parser.get<int>("height"));
    float binThresh = parser.get<float>("binary_threshold");
    float polyThresh = parser.get<float>("polygon_threshold");
    int maxCand = parser.get<int>("max_candidates");
    double unRatio = parser.get<double>("unclip_ratio");
    bool save = parser.get<bool>("save");
    bool viz = parser.get<bool>("viz");

    DB model(modelName, inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);

    //! [Open a video file or an image file or a camera stream]
    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(0);
    if (!cap.isOpened())
        CV_Error(Error::StsError, "Cannot open video or file");
    Mat originalImage;
    static const std::string kWinName = modelName;
    while (waitKey(1) < 0)
    {
        cap >> originalImage;
        if (originalImage.empty())
        {
            cout << "Frame is empty" << endl;
            waitKey();
            break;
        }
        int originalW = originalImage.cols;
        int originalH = originalImage.rows;
        double scaleHeight = originalH / double(inpSize.height);
        double scaleWidth = originalW / double(inpSize.width);
        Mat image;
        resize(originalImage, image, inpSize);

        // inference
        TickMeter tm;
        tm.start();
        pair< vector<vector<Point>>, vector<float> > results = model.infer(image);
        tm.stop();
        // Map the four corners of each detected quadrilateral back to the original resolution
        for (auto& pts : results.first)
        {
            for (int i = 0; i < 4; i++)
            {
                pts[i].x = int(pts[i].x * scaleWidth);
                pts[i].y = int(pts[i].y * scaleHeight);
            }
        }
        originalImage = visualize(originalImage, results, tm.getFPS());
        tm.reset();
        if (parser.has("input"))
        {
            if (save)
            {
                cout << "Result image saved to result.jpg\n";
                imwrite("result.jpg", originalImage);
            }
            if (viz)
                imshow(kWinName, originalImage);
        }
        else
            imshow(kWinName, originalImage);
    }
    return 0;
}
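Beyond the flags shown in the README, the keys table in demo.cpp above also exposes the input size, thresholds, and backend selection. A rough invocation sketch (paths are placeholders; flag names are taken from the keys string, and OpenCV's CommandLineParser also accepts the --name=value form):

```shell
# Sketch only: run on an image at a larger input size (must be a multiple of 32)
# using the CUDA backend/target pair (index 1 in backendTargetPairs).
./build/opencv_zoo_text_detection_db -m=/path/to/model -i=/path/to/image \
    --width=960 --height=960 --binary_threshold=0.3 --backend=1
```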
