#include <iostream>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

using namespace std;
using namespace cv;
using namespace dnn;
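
// The value passed to the --backend option is an index into this table of (backend, target) pairs.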
vector< pair<cv::dnn::Backend, cv::dnn::Target> > backendTargetPairs = {
        std::make_pair(dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU),
        std::make_pair(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA),
        std::make_pair(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA_FP16),
        std::make_pair(dnn::DNN_BACKEND_TIMVX, dnn::DNN_TARGET_NPU),
        std::make_pair(dnn::DNN_BACKEND_CANN, dnn::DNN_TARGET_NPU) };

std::string keys =
"{ help h | | Print help message. }"
"{ model m | text_detection_DB_IC15_resnet18_2021sep.onnx | Usage: Path to the model file, defaults to text_detection_DB_IC15_resnet18_2021sep.onnx }"
"{ input i | | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
"{ width | 736 | Usage: Resize input image to certain width, default = 736. It should be a multiple of 32.}"
"{ height | 736 | Usage: Resize input image to certain height, default = 736. It should be a multiple of 32.}"
"{ binary_threshold | 0.3 | Usage: Threshold of the binary map, default = 0.3.}"
"{ polygon_threshold | 0.5 | Usage: Threshold of polygons, default = 0.5.}"
"{ max_candidates | 200 | Usage: Set maximum number of polygon candidates, default = 200.}"
"{ unclip_ratio | 2.0 | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
"{ save s | true | Usage: Specify to save the result image with the detected bounding boxes drawn. Invalid in case of camera input.}"
"{ viz v | true | Usage: Specify to open a new window to show results. Invalid in case of camera input.}"
"{ backend bt | 0 | Choose one of computation backends: "
"0: (default) OpenCV implementation + CPU, "
"1: CUDA + GPU (CUDA), "
"2: CUDA + GPU (CUDA FP16), "
"3: TIM-VX + NPU, "
"4: CANN + NPU}";

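// Thin wrapper around cv::dnn::TextDetectionModel_DB: loads the network, applies the
// DB post-processing parameters, and exposes a single infer() call.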
class DB {
public:

    DB(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3f,
       float polyThresh = 0.5f, int maxCand = 200, double unRatio = 2.0,
       dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU)
        : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
          polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
          backendId(bId), targetId(tId)
    {
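        // Load the ONNX model and route inference to the requested backend/target.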
        this->model = TextDetectionModel_DB(readNet(modelPath));
        this->model.setPreferableBackend(backendId);
        this->model.setPreferableTarget(targetId);

        this->model.setBinaryThreshold(binaryThreshold);
        this->model.setPolygonThreshold(polygonThreshold);
        this->model.setUnclipRatio(unclipRatio);
        this->model.setMaxCandidates(maxCandidates);

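        // Preprocessing: scale pixel values to [0, 1] and subtract the per-channel mean the model expects.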
        this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793));
    }
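
    // Run detection on an image that has already been resized to inputSize;
    // returns the detected text polygons and their confidence scores.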
    pair< vector<vector<Point>>, vector<float> > infer(Mat image) {
        CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size");
        CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size");
        vector<vector<Point>> pt;
        vector<float> confidence;
        this->model.detect(image, pt, confidence);
        return make_pair(pt, confidence);
    }

private:
    string modelPath;
    TextDetectionModel_DB model;
    Size inputSize;
    float binaryThreshold;
    float polygonThreshold;
    int maxCandidates;
    double unclipRatio;
    dnn::Backend backendId;
    dnn::Target targetId;
};

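// Draw the detected polygons (and FPS, if available) on a copy of the input image.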
Mat visualize(Mat image, pair< vector<vector<Point>>, vector<float> >& results, double fps = -1,
              Scalar boxColor = Scalar(0, 255, 0), Scalar textColor = Scalar(0, 0, 255),
              bool isClosed = true, int thickness = 2)
{
    Mat output;
    image.copyTo(output);
    if (fps > 0)
        putText(output, format("FPS: %.2f", fps), Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
    polylines(output, results.first, isClosed, boxColor, thickness);
    return output;
}

int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);

    parser.about("Use this program to run Real-time Scene Text Detection with Differentiable Binarization from the OpenCV Zoo using OpenCV.");
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    int backendTargetid = parser.get<int>("backend");
    String modelName = parser.get<String>("model");

    if (modelName.empty())
    {
        CV_Error(Error::StsError, "Model file path must not be empty");
    }

    Size inpSize(parser.get<int>("width"), parser.get<int>("height"));
    float binThresh = parser.get<float>("binary_threshold");
    float polyThresh = parser.get<float>("polygon_threshold");
    int maxCand = parser.get<int>("max_candidates");
    double unRatio = parser.get<double>("unclip_ratio");
    bool save = parser.get<bool>("save");
    bool viz = parser.get<bool>("viz");

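    // Build the detector with the chosen backend/target pair.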
    DB model(modelName, inpSize, binThresh, polyThresh, maxCand, unRatio,
             backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);

    //! [Open a video file or an image file or a camera stream]
    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(0);
    if (!cap.isOpened())
        CV_Error(Error::StsError, "Cannot open video or file");
    Mat originalImage;
    static const std::string kWinName = modelName;
    while (waitKey(1) < 0)
    {
        cap >> originalImage;
        if (originalImage.empty())
        {
            cout << "Frame is empty" << endl;
            waitKey();
            break;
        }
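
        // Remember the original frame size so detections on the resized input can be mapped back.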
        int originalW = originalImage.cols;
        int originalH = originalImage.rows;
        double scaleHeight = originalH / double(inpSize.height);
        double scaleWidth = originalW / double(inpSize.width);
        Mat image;
        resize(originalImage, image, inpSize);

        // Inference
        TickMeter tm;
        tm.start();
        pair< vector<vector<Point>>, vector<float> > results = model.infer(image);
        tm.stop();
        // Scale the detected polygons back to the original image size
        for (auto& pts : results.first)
        {
            for (auto& p : pts)
            {
                p.x = int(p.x * scaleWidth);
                p.y = int(p.y * scaleHeight);
            }
        }
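        // Draw the detections and the measured FPS on the original-resolution frame.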
        originalImage = visualize(originalImage, results, tm.getFPS());
        tm.reset();
        if (parser.has("input"))
        {
            if (save)
            {
                cout << "Result image saved to result.jpg\n";
                imwrite("result.jpg", originalImage);
            }
            if (viz)
                imshow(kWinName, originalImage);
        }
        else
            imshow(kWinName, originalImage);
    }
    return 0;
}