123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
#ifndef YOLO_HPP__
#define YOLO_HPP__
#include "src/common/data.hpp"
#include "infer/infer.hpp"
#include "opencv2/opencv.hpp"
#include "opencv2/dnn.hpp"
#include <algorithm>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
- namespace yolo
- {
- class Yolov5InferImpl : public Infer
- {
- public:
- ModelType model_type;
- std::shared_ptr<cv::dnn::Net> net_;
- float confidence_threshold_;
- float nms_threshold_;
- int network_input_width_, network_input_height_;
- std::vector<std::string> names_;
- bool load(const std::string& model_path, const std::vector<std::string>& names, int gpu_id=0, float confidence_threshold=0.5f, float nms_threshold=0.45f)
- {
- net_ = std::make_shared<cv::dnn::Net>(cv::dnn::readNet(model_path));
- // 获取模型输入层名称
- std::vector<std::string> inputNames = net->getLayerNames();
-
- // 获取输入层的形状信息
- std::vector<std::vector<int>> inShapes, outShapes;
- net.getLayerShapes(cv::dnn::Dict(), 0, inShapes, outShapes);
- if (!inShapes.empty()) {
- int batchSize = inShapes[0][0]; // 批次大小(通常为1)
- int channels = inShapes[0][1]; // 通道数
- network_input_height_ = inShapes[0][2]; // 高度
- network_input_width_ = inShapes[0][3]; // 宽度
- std::cout << "Model Input Shape: " << batchSize << "x"
- << channels << "x" << network_input_height_ << "x" << network_input_width_ << std::endl;
- } else {
- std::cout << "Failed to get input shape!" << std::endl;
- }
- return true;
- }
- void warpAffine(cv::Mat& src_image, cv::Mat& dst_image, float *d2i)
- {
- int src_image_width = src_image.cols;
- int src_image_height = src_image.rows;
- float scale_x = network_input_width_ / (float)src_image_width;
- float scale_y = network_input_height_ / (float)src_image_height;
- float scale = std::min(scale_x, scale_y);
- float i2d[6];
- i2d[0] = scale;
- i2d[1] = 0;
- i2d[2] = (-scale * src_image_width + network_input_width_ + scale - 1) * 0.5;
- i2d[3] = 0;
- i2d[4] = scale;
- i2d[5] = (-scale * src_image_height + network_input_height_ + scale - 1) * 0.5;
- cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
- cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
- cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);
- dst_image.create(network_input_height_, network_input_width_, CV_8UC3);
- cv::warpAffine(src_image, dst_image, m2x3_i2d, dst_image.size(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));
- }
- void decode(std::vector<cv::Mat>& outs,
- data::BoxArray& result_boxes,
- float *d2i,
- int src_image_width,
- int src_image_height)
- {
- data::BoxArray boxes;
- int cols = outs[0].size[2];
- int rows = outs[0].size[1];
- float* predict = (float*)outs[0].data;
- int num_classes = cols - 5;
-
- for(int i = 0; i < rows; ++i)
- {
- float* pitem = predict + i * cols;
- float objness = pitem[4];
- if (objness < confidence_threshold_)
- {
- continue;
- }
-
- float* pclass = pitem + 5;
- int label = std::max_element(pclass, pclass + num_classes) - pclass;
- float prob = pclass[label];
- float confidence = prob * objness;
- if(confidence < confidence_threshold_)
- {
- continue;
- }
- float cx = pitem[0];
- float cy = pitem[1];
- float width = pitem[2];
- float height = pitem[3];
-
- // 通过反变换恢复到图像尺度
- float left = (cx - width * 0.5) * d2i[0] + d2i[2];
- float top = (cy - height * 0.5) * d2i[0] + d2i[5];
- float right = (cx + width * 0.5) * d2i[0] + d2i[2];
- float bottom = (cy + height * 0.5) * d2i[0] + d2i[5];
- boxes.emplace_back(left, top, right, bottom, confidence, names_[label]);
- }
- std::sort(boxes.begin(), boxes.end(), [](Box& a, Box& b){return a.confidence > b.confidence;});
- std::vector<bool> remove_flags(boxes.size());
- result_boxes.reserve(boxes.size());
-
- auto iou = [](const Box& a, const Box& b){
- int cross_left = std::max(a.left, b.left);
- int cross_top = std::max(a.top, b.top);
- int cross_right = std::min(a.right, b.right);
- int cross_bottom = std::min(a.bottom, b.bottom);
-
- int cross_area = std::max(0, cross_right - cross_left) * std::max(0, cross_bottom - cross_top);
- int union_area = std::max(0.f, a.right - a.left) * std::max(0.f, a.bottom - a.top)
- + std::max(0.f, b.right - b.left) * std::max(0.f, b.bottom - b.top) - cross_area;
- if(cross_area == 0 || union_area == 0) return 0.0f;
- return 1.0f * cross_area / union_area;
- };
-
- for(int i = 0; i < boxes.size(); ++i)
- {
- if(remove_flags[i]) continue;
-
- auto& ibox = boxes[i];
- result_boxes.emplace_back(ibox);
- for (int j = i + 1; j < boxes.size(); ++j)
- {
- if (remove_flags[j]) continue;
-
- auto& jbox = boxes[j];
- if (ibox.class_id == jbox.class_id)
- {
- // class matched
- if (iou(ibox, jbox) >= nms_threshold_)
- remove_flags[j] = true;
- }
- }
- }
-
- }
- virtual data::BoxArray forward(cv::Mat& image) override
- {
- float d2i[6];
- cv::Mat affine_image;
- warpAffine(image, affine_image, d2i);
- std::vector<cv::Mat> outs;
- auto blob = cv::dnn::blobFromImage(affine_image, 1 / 255.0, cv::Size(network_input_height_, network_input_width_), cv::Scalar(0, 0, 0), true, false);
- net_->setInput(blob);
- net_->forward(outs, net_.getUnconnectedOutLayersNames());
- data::BoxArray result;
- decode(outs, result, d2i, image.cols, image.rows);
- return result;
- }
- };
- }
- #endif // YOLO_HPP__
|