leon 2 months ago
Parent commit 1781d2cfcf

+ 2 - 1
.vscode/settings.json

@@ -55,6 +55,7 @@
         "algorithm": "cpp",
         "tuple": "cpp",
         "future": "cpp",
-        "iomanip": "cpp"
+        "iomanip": "cpp",
+        "fstream": "cpp"
     }
 }

+ 1 - 1
Makefile

@@ -41,7 +41,7 @@ link_trt          := nvinfer nvinfer_plugin nvonnxparser
 link_cuda         := cuda cublas cudart cudnn
 link_sys          := stdc++ dl
 
-link_librarys     := $(link_opencv) $(link_cuda) $(link_sys)
+link_librarys     := $(link_opencv) $(link_trt) $(link_cuda) $(link_sys)
 
 
 empty := 

+ 124 - 0
src/common/check.hpp

@@ -0,0 +1,124 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+ #ifndef __CHECK_HPP__
+ #define __CHECK_HPP__
+ 
+ #include <assert.h>
+ #include <cuda_runtime.h>
+ #include <stdarg.h>
+ #include <stdio.h>
+ 
+ #include <string>
+ 
+ namespace nv
+ {
+ 
+ #define NVUNUSED2(a, b) \
+     {                   \
+         (void)(a);      \
+         (void)(b);      \
+     }
+ #define NVUNUSED(a) \
+     {               \
+         (void)(a);  \
+     }
+ 
+ #if DEBUG
+ #define checkRuntime(call) nv::check_runtime(call, #call, __LINE__, __FILE__)
+ #define checkKernel(...)                                                                \
+     [&]                                                                                 \
+     {                                                                                   \
+         __VA_ARGS__;                                                                    \
+         checkRuntime(cudaStreamSynchronize(nullptr));                                   \
+         return nv::check_runtime(cudaGetLastError(), #__VA_ARGS__, __LINE__, __FILE__); \
+     }()
+ #define dprintf printf
+ #else
+ #define checkRuntime(call) nv::check_runtime(call, #call, __LINE__, __FILE__)
+ #define checkKernel(...)                                                            \
+     do                                                                              \
+     {                                                                               \
+         __VA_ARGS__;                                                                \
+         nv::check_runtime(cudaPeekAtLastError(), #__VA_ARGS__, __LINE__, __FILE__); \
+     } while (false)
+ #define dprintf(...)
+ #endif
+ 
+ #define Assertf(cond, fmt, ...)                                               \
+     do                                                                        \
+     {                                                                         \
+         if (!(cond))                                                          \
+         {                                                                     \
+             fprintf(stderr,                                                   \
+                     "Assert failed 💀. %s in file %s:%d, message: " fmt "\n", \
+                     #cond,                                                    \
+                     __FILE__,                                                 \
+                     __LINE__,                                                 \
+                     __VA_ARGS__);                                             \
+             abort();                                                          \
+         }                                                                     \
+     } while (false)
+ 
+ #define Asserts(cond, s)                                                                                        \
+     do                                                                                                          \
+     {                                                                                                           \
+         if (!(cond))                                                                                            \
+         {                                                                                                       \
+             fprintf(stderr, "Assert failed 💀. %s in file %s:%d, message: " s "\n", #cond, __FILE__, __LINE__); \
+             abort();                                                                                            \
+         }                                                                                                       \
+     } while (false)
+ 
+ #define Assert(cond)                                                                            \
+     do                                                                                          \
+     {                                                                                           \
+         if (!(cond))                                                                            \
+         {                                                                                       \
+             fprintf(stderr, "Assert failed 💀. %s in file %s:%d\n", #cond, __FILE__, __LINE__); \
+             abort();                                                                            \
+         }                                                                                       \
+     } while (false)
+ 
+ static inline bool check_runtime(cudaError_t e, const char *call, int line, const char *file)
+ {
+     if (e != cudaSuccess)
+     {
+         fprintf(stderr,
+                 "CUDA Runtime error %s # %s, code = %s [ %d ] in file "
+                 "%s:%d\n",
+                 call,
+                 cudaGetErrorString(e),
+                 cudaGetErrorName(e),
+                 e,
+                 file,
+                 line);
+         abort();
+         return false;
+     }
+     return true;
+ }
+ 
+ }; // namespace nv
+ 
+ #endif // __CHECK_HPP__
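
A minimal usage sketch for these helpers (assuming a CUDA translation unit; my_kernel and the sizes are hypothetical):

    #include "common/check.hpp"

    __global__ void my_kernel(float *data) { data[threadIdx.x] *= 2.0f; }

    int main() {
        float *d = nullptr;
        // aborts with the call site, error name and file:line on failure
        checkRuntime(cudaMalloc(&d, 32 * sizeof(float)));
        // launches, then checks cudaPeekAtLastError (plus a stream sync in DEBUG builds)
        checkKernel(my_kernel<<<1, 32>>>(d));
        Assertf(d != nullptr, "allocation of %d floats failed", 32);
        checkRuntime(cudaFree(d));
        return 0;
    }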

+ 10 - 10
src/common/data.hpp

@@ -32,18 +32,17 @@ struct Point
 
 struct Box
 {
-    float left;
-    float top;
-    float right;
-    float bottom;
-    float score;
+    float left, top, right, bottom, score;
+    int class_id;
     std::string label;
     std::vector<Point> points;
-    Box() : left(0), top(0), right(0), bottom(0), score(0), label("") {}
-    Box(float left, float top, float right, float bottom, float score, const std::string& label) 
-        : left(left), top(top), right(right), bottom(bottom), score(score), label(label) {}
-    Box(const Box& b) : left(b.left), top(b.top), right(b.right), bottom(b.bottom), score(b.score), label(b.label), points(b.points) {}
-    Box(const Box&& b) : left(b.left), top(b.top), right(b.right), bottom(b.bottom), score(b.score), label(b.label), points(b.points) {}
+    Box() : left(0), top(0), right(0), bottom(0), score(0), class_id(0), label("") {}
+    Box(float left, float top, float right, float bottom, float score, int class_id) 
+        : left(left), top(top), right(right), bottom(bottom), score(score), class_id(class_id), label("") {}
+    Box(float left, float top, float right, float bottom, float score, int class_id,  const std::string& label) 
+        : left(left), top(top), right(right), bottom(bottom), score(score), class_id(class_id), label(label) {}
+    Box(const Box& b) : left(b.left), top(b.top), right(b.right), bottom(b.bottom), score(b.score), class_id(b.class_id), label(b.label), points(b.points) {}
+    Box(Box&& b) : left(b.left), top(b.top), right(b.right), bottom(b.bottom), score(b.score), class_id(b.class_id), label(std::move(b.label)), points(std::move(b.points)) {}
     Box& operator=(const Box& b)
     {
         left = b.left;
@@ -51,6 +50,7 @@ struct Box
         right = b.right;
         bottom = b.bottom;
         score = b.score;
+        class_id = b.class_id;
         label = b.label;
         points = b.points;
         return *this;

+ 8 - 0
src/common/image.cpp

@@ -0,0 +1,8 @@
+#include "common/image.hpp"
+
+namespace tensor
+{
+
+tensor::Image cvimg(const cv::Mat &image) { return Image(image.data, image.cols, image.rows); }
+
+} // namespace tensor

+ 21 - 0
src/common/image.hpp

@@ -0,0 +1,21 @@
+#ifndef __IMAGE_HPP__
+#define __IMAGE_HPP__
+#include "opencv2/opencv.hpp"
+
+namespace tensor
+{
+
+struct Image
+{
+    const void *bgrptr = nullptr;
+    int width = 0, height = 0;
+
+    Image() = default;
+    Image(const void *bgrptr, int width, int height) : bgrptr(bgrptr), width(width), height(height) {}
+};
+
+Image cvimg(const cv::Mat &image);
+
+} // namespace tensor
+
+#endif
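
A short usage note, as a sketch: Image is a non-owning view over the cv::Mat's BGR data, so the Mat must outlive it (the file name here is a placeholder):

    cv::Mat frame = cv::imread("demo.jpg");        // 8UC3, BGR
    tensor::Image img = tensor::cvimg(frame);      // wraps frame.data, cols, rows; no copy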

+ 112 - 0
src/common/memory.cu

@@ -0,0 +1,112 @@
+#include "common/check.hpp"
+#include "common/memory.hpp"
+#include <cuda_runtime.h>
+
+namespace tensor
+{
+
+using namespace std;
+
+static size_t upbound(size_t n, size_t align) { return (n + align - 1) / align * align; }
+
+BaseMemory::BaseMemory(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes)
+{
+    reference(cpu, cpu_bytes, gpu, gpu_bytes);
+}
+
+void BaseMemory::reference(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes)
+{
+    release();
+
+    if (cpu == nullptr || cpu_bytes == 0)
+    {
+        cpu       = nullptr;
+        cpu_bytes = 0;
+    }
+
+    if (gpu == nullptr || gpu_bytes == 0)
+    {
+        gpu       = nullptr;
+        gpu_bytes = 0;
+    }
+
+    this->cpu_          = cpu;
+    this->cpu_capacity_ = cpu_bytes;
+    this->cpu_bytes_    = cpu_bytes;
+    this->gpu_          = gpu;
+    this->gpu_capacity_ = gpu_bytes;
+    this->gpu_bytes_    = gpu_bytes;
+
+    this->owner_cpu_ = !(cpu && cpu_bytes > 0);
+    this->owner_gpu_ = !(gpu && gpu_bytes > 0);
+}
+
+BaseMemory::~BaseMemory() { release(); }
+
+void *BaseMemory::gpu_realloc(size_t bytes)
+{
+    // round the requested size up to a 32-byte boundary
+    size_t size = upbound(bytes, 32);
+    if (gpu_capacity_ < size)
+    {
+        release_gpu();
+
+        gpu_capacity_ = size;
+        checkRuntime(cudaMalloc(&gpu_, size));
+        // checkRuntime(cudaMemset(gpu_, 0, size));
+    }
+    gpu_bytes_ = bytes;
+    return gpu_;
+}
+
+void *BaseMemory::cpu_realloc(size_t bytes)
+{
+    size_t size = upbound(bytes, 32);
+    if (cpu_capacity_ < size)
+    {
+        release_cpu();
+
+        cpu_capacity_ = size;
+        checkRuntime(cudaMallocHost(&cpu_, size));
+        Assert(cpu_ != nullptr);
+        // memset(cpu_, 0, size);
+    }
+    cpu_bytes_ = bytes;
+    return cpu_;
+}
+
+void BaseMemory::release_cpu()
+{
+    if (cpu_)
+    {
+        if (owner_cpu_)
+        {
+            checkRuntime(cudaFreeHost(cpu_));
+        }
+        cpu_ = nullptr;
+    }
+    cpu_capacity_ = 0;
+    cpu_bytes_    = 0;
+}
+
+void BaseMemory::release_gpu()
+{
+    if (gpu_)
+    {
+        if (owner_gpu_)
+        {
+            checkRuntime(cudaFree(gpu_));
+        }
+        gpu_ = nullptr;
+    }
+    gpu_capacity_ = 0;
+    gpu_bytes_    = 0;
+}
+
+void BaseMemory::release()
+{
+    release_cpu();
+    release_gpu();
+}
+
+} // namespace tensor

+ 61 - 0
src/common/memory.hpp

@@ -0,0 +1,61 @@
+#ifndef __MEMORY_HPP__
+#define __MEMORY_HPP__
+
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tensor
+{
+
+class BaseMemory
+{
+  public:
+    BaseMemory() = default;
+    BaseMemory(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes);
+    virtual ~BaseMemory();
+    virtual void *gpu_realloc(size_t bytes);
+    virtual void *cpu_realloc(size_t bytes);
+    void release_gpu();
+    void release_cpu();
+    void release();
+    inline bool owner_gpu() const { return owner_gpu_; }
+    inline bool owner_cpu() const { return owner_cpu_; }
+    inline size_t cpu_bytes() const { return cpu_bytes_; }
+    inline size_t gpu_bytes() const { return gpu_bytes_; }
+    virtual inline void *get_gpu() const { return gpu_; }
+    virtual inline void *get_cpu() const { return cpu_; }
+    void reference(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes);
+
+  protected:
+    void *cpu_           = nullptr;
+    size_t cpu_bytes_    = 0;
+    size_t cpu_capacity_ = 0;
+    bool owner_cpu_      = true;
+
+    void *gpu_           = nullptr;
+    size_t gpu_bytes_    = 0;
+    size_t gpu_capacity_ = 0;
+    bool owner_gpu_      = true;
+};
+
+template <typename _DT> class Memory : public BaseMemory
+{
+  public:
+    Memory()                               = default;
+    Memory(const Memory &other)            = delete;
+    Memory &operator=(const Memory &other) = delete;
+    virtual _DT *gpu(size_t size) { return (_DT *)BaseMemory::gpu_realloc(size * sizeof(_DT)); }
+    virtual _DT *cpu(size_t size) { return (_DT *)BaseMemory::cpu_realloc(size * sizeof(_DT)); }
+
+    inline size_t cpu_size() const { return cpu_bytes_ / sizeof(_DT); }
+    inline size_t gpu_size() const { return gpu_bytes_ / sizeof(_DT); }
+
+    virtual inline _DT *gpu() const { return (_DT *)gpu_; }
+    virtual inline _DT *cpu() const { return (_DT *)cpu_; }
+};
+
+} // namespace tensor
+
+#endif
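
A minimal sketch of how Memory<T> is meant to be used (buffer names and sizes are illustrative):

    #include "common/memory.hpp"
    #include <cuda_runtime.h>

    tensor::Memory<float> buf;
    float *host   = buf.cpu(1024);   // pinned allocation via cudaMallocHost, capacity grows as needed
    float *device = buf.gpu(1024);   // device allocation via cudaMalloc, size rounded up to 32 bytes
    // ... fill host ...
    cudaMemcpyAsync(device, host, buf.gpu_bytes(), cudaMemcpyHostToDevice, nullptr);
    cudaStreamSynchronize(nullptr);
    // release() / the destructor free only the pointers the object owns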

+ 6 - 5
src/common/meta.hpp

@@ -9,11 +9,12 @@ namespace meta
 {
 
 struct MetaData{
-    std::string from;
-    cv::Mat image;
-    cv::Mat draw_image;
-    data::BoxArray boxes;
-    data::BoxArray result;
+    std::string from; // image source
+    cv::Mat image; // original image
+    cv::Mat draw_image; // image with detections drawn
+    cv::Mat depth; // depth map
+    data::BoxArray boxes; // object detection results
+    data::BoxArray result; // analysis results
 };
 
 }

+ 357 - 0
src/common/tensorrt.cpp

@@ -0,0 +1,357 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+ #include "tensorrt.hpp"
+
+ #include <cuda_runtime.h>
+ #include <string.h>
+ 
+ #include <algorithm>
+ #include <fstream>
+ #include <iostream>
+ #include <numeric>
+ #include <unordered_map>
+ #include <vector>
+ 
+ #include "NvInfer.h"
+ #include "NvInferRuntime.h"
+ #include "check.hpp"
+ 
+ namespace TensorRT10 {
+ 
+ static class Logger : public nvinfer1::ILogger {
+  public:
+   void log(Severity severity, const char *msg) noexcept override {
+     if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR) {
+       std::cerr << "[NVINFER LOG]: " << msg << std::endl;
+     }
+   }
+ } gLogger_;
+ 
+ static std::string format_shape(const nvinfer1::Dims &shape) {
+   char buf[200] = {0};
+   char *p = buf;
+   for (int i = 0; i < shape.nbDims; ++i) {
+     if (i + 1 < shape.nbDims)
+       p += sprintf(p, "%d x ", (int)shape.d[i]);
+     else
+       p += sprintf(p, "%d", (int)shape.d[i]);
+   }
+   return buf;
+ }
+ 
+ static std::vector<uint8_t> load_file(const std::string &file) {
+   std::ifstream in(file, std::ios::in | std::ios::binary);
+   if (!in.is_open()) return {};
+ 
+   in.seekg(0, std::ios::end);
+   size_t length = in.tellg();
+ 
+   std::vector<uint8_t> data;
+   if (length > 0) {
+     in.seekg(0, std::ios::beg);
+     data.resize(length);
+ 
+     in.read((char *)&data[0], length);
+   }
+   in.close();
+   return data;
+ }
+ 
+ static const char *data_type_string(nvinfer1::DataType dt) {
+   switch (dt) {
+     case nvinfer1::DataType::kFLOAT:
+       return "float32";
+     case nvinfer1::DataType::kHALF:
+       return "float16";
+     case nvinfer1::DataType::kINT8:
+       return "int8";
+     case nvinfer1::DataType::kINT32: 
+       return "int32";
+     case nvinfer1::DataType::kBOOL: 
+       return "bool";
+     case nvinfer1::DataType::kUINT8: 
+       return "uint8";
+ 
+     #if NV_TENSORRT_MAJOR >= 10
+       case nvinfer1::DataType::kFP8: 
+         return "fp8";
+       case nvinfer1::DataType::kBF16: 
+         return "bf16";
+       case nvinfer1::DataType::kINT64: 
+         return "int64";
+       case nvinfer1::DataType::kINT4:
+         return "int4";
+     #endif
+ 
+     default:
+       return "Unknow";
+   }
+ }
+ 
+ template <typename _T>
+ static void destroy_pointer(_T *ptr) {
+   if (ptr) delete ptr;
+ }
+ 
+ class __native_engine_context {
+  public:
+   virtual ~__native_engine_context() { destroy(); }
+ 
+   bool construct(const void *pdata, size_t size, const char *message_name) {
+     destroy();
+ 
+     if (pdata == nullptr || size == 0) {
+       printf("Construct for empty data found.\n");
+       return false;
+     }
+ 
+     runtime_ = std::shared_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(gLogger_), destroy_pointer<nvinfer1::IRuntime>);
+     if (runtime_ == nullptr) {
+       printf("Failed to create tensorRT runtime: %s.\n", message_name);
+       return false;
+     }
+ 
+     engine_ = std::shared_ptr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(pdata, size),
+                                                      destroy_pointer<nvinfer1::ICudaEngine>);
+ 
+     if (engine_ == nullptr) {
+       printf("Failed to deserialize engine: %s\n", message_name);
+       return false;
+     }
+ 
+     context_ = std::shared_ptr<nvinfer1::IExecutionContext>(engine_->createExecutionContext(),
+                                                             destroy_pointer<nvinfer1::IExecutionContext>);
+     if (context_ == nullptr) {
+       printf("Failed to create execution context: %s\n", message_name);
+       return false;
+     }
+     return context_ != nullptr;
+   }
+ 
+  private:
+   void destroy() {
+     context_.reset();
+     engine_.reset();
+     runtime_.reset();
+   }
+ 
+  public:
+   std::shared_ptr<nvinfer1::IExecutionContext> context_;
+   std::shared_ptr<nvinfer1::ICudaEngine> engine_;
+   std::shared_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
+ };
+ 
+ class EngineImplement : public Engine {
+  public:
+   std::shared_ptr<__native_engine_context> context_;
+   std::unordered_map<std::string, int> binding_name_to_index_;
+ 
+   virtual ~EngineImplement() = default;
+ 
+   bool construct(const void *data, size_t size, const char *message_name) {
+     context_ = std::make_shared<__native_engine_context>();
+     if (!context_->construct(data, size, message_name)) {
+       return false;
+     }
+ 
+     setup();
+     return true;
+   }
+ 
+   bool load(const std::string &file) {
+     auto data = load_file(file);
+     if (data.empty()) {
+       printf("An empty file has been loaded. Please confirm your file path: %s\n", file.c_str());
+       return false;
+     }
+     return this->construct(data.data(), data.size(), file.c_str());
+   }
+ 
+   void setup() {
+     auto engine = this->context_->engine_;
+     int nbBindings = engine->getNbIOTensors();
+ 
+     binding_name_to_index_.clear();
+     for (int i = 0; i < nbBindings; ++i) {
+       const char *bindingName = engine->getIOTensorName(i);
+       binding_name_to_index_[bindingName] = i;
+     }
+   }
+ 
+   virtual int index(const std::string &name) override {
+     auto iter = binding_name_to_index_.find(name);
+     Assertf(iter != binding_name_to_index_.end(), "Cannot find the binding name: %s", name.c_str());
+     return iter->second;
+   }
+ 
+   virtual bool forward(const std::unordered_map<std::string, const void *> &bindings, void *stream, void *input_consum_event) override {
+     auto engine = this->context_->engine_;
+     auto context = this->context_->context_;
+     int ibinding = 0;
+     for(; ibinding < engine->getNbIOTensors(); ++ibinding){
+       auto tensor_name = engine->getIOTensorName(ibinding);
+       auto binding_iter = bindings.find(tensor_name);
+       if(binding_iter == bindings.end()){
+         printf("Failed to set the tensor address, can not found tensor %s in bindings provided.", tensor_name);
+         return false;
+       }
+ 
+       if(!context->setTensorAddress(tensor_name, (void*)binding_iter->second)){
+         printf("Failed to set tensor address for tensor %s\n", tensor_name);
+         return false;
+       }
+     }
+     return context->enqueueV3((cudaStream_t)stream);
+   }
+ 
+   virtual std::vector<int> run_dims(const std::string &name) override { return run_dims(index(name)); }
+ 
+   virtual std::vector<int> run_dims(int ibinding) override {
+     auto engine = this->context_->engine_;
+     auto context = this->context_->context_;
+     auto dim = context->getTensorShape(engine->getIOTensorName(ibinding));
+     return std::vector<int>(dim.d, dim.d + dim.nbDims);
+   }
+ 
+   virtual std::vector<int> static_dims(const std::string &name) override { return static_dims(index(name)); }
+ 
+   virtual std::vector<int> static_dims(int ibinding) override {
+     auto engine = this->context_->engine_;
+     auto dim = engine->getTensorShape(engine->getIOTensorName(ibinding));
+     return std::vector<int>(dim.d, dim.d + dim.nbDims);
+   }
+ 
+   virtual int num_bindings() override { return this->context_->engine_->getNbIOTensors(); }
+ 
+   virtual bool is_input(int ibinding) override { 
+     auto engine = this->context_->engine_;
+     return engine->getTensorIOMode(engine->getIOTensorName(ibinding)) == nvinfer1::TensorIOMode::kINPUT; 
+   }
+ 
+   virtual bool is_input(const std::string &name) override { 
+     auto engine = this->context_->engine_;
+     return engine->getTensorIOMode(name.c_str()) == nvinfer1::TensorIOMode::kINPUT; 
+   }
+ 
+   virtual bool set_run_dims(const std::string &name, const std::vector<int> &dims) override {
+     return this->set_run_dims(index(name), dims);
+   }
+ 
+   virtual bool set_run_dims(int ibinding, const std::vector<int> &dims) override {
+     nvinfer1::Dims d;
+     for (int i = 0; i < dims.size(); ++i) {
+         d.d[i] = static_cast<int64_t>(dims[i]);  // convert to int64_t (TensorRT 10 Dims uses 64-bit extents)
+     }
+     // memcpy(d.d, dims.data(), sizeof(int) * dims.size());
+     d.nbDims = dims.size();
+     auto engine = this->context_->engine_;
+     auto context = this->context_->context_;
+     // return context->setInputShape("images", nvinfer1::Dims{4, {1, 3, 640, 640}});
+     return context->setInputShape(engine->getIOTensorName(ibinding), d);
+   }
+ 
+   virtual int numel(const std::string &name) override { return numel(index(name)); }
+ 
+   virtual int numel(int ibinding) override {
+     auto dim = this->run_dims(ibinding);
+     return std::accumulate(dim.begin(), dim.end(), 1, std::multiplies<int>());
+   }
+ 
+   virtual DType dtype(const std::string &name) override { return dtype(index(name)); }
+ 
+   virtual DType dtype(int ibinding) override { 
+     auto engine = this->context_->engine_;
+     auto dtype = engine->getTensorDataType(engine->getIOTensorName(ibinding));
+     switch(dtype){
+       case nvinfer1::DataType::kFLOAT: return DType::FLOAT;
+       case nvinfer1::DataType::kHALF: return DType::HALF;
+       case nvinfer1::DataType::kINT8: return DType::INT8;
+       case nvinfer1::DataType::kINT32: return DType::INT32;
+       case nvinfer1::DataType::kBOOL: return DType::BOOL;
+       case nvinfer1::DataType::kUINT8: return DType::UINT8;
+ 
+       #if NV_TENSORRT_MAJOR >= 10
+         case nvinfer1::DataType::kFP8: return DType::FP8;
+         case nvinfer1::DataType::kBF16: return DType::BF16;
+         case nvinfer1::DataType::kINT64: return DType::INT64;
+         case nvinfer1::DataType::kINT4: return DType::INT4;
+       #endif
+ 
+       default: return DType::NONE;
+     }
+   }
+ 
+   virtual bool has_dynamic_dim() override {
+     // check if any input or output bindings have dynamic shapes
+     // code from ChatGPT
+     int numBindings = this->num_bindings();
+     for (int i = 0; i < numBindings; ++i) {
+       auto dims = this->static_dims(i);
+       for (size_t j = 0; j < dims.size(); ++j) {
+         if (dims[j] == -1) return true;
+       }
+     }
+     return false;
+   }
+ 
+   virtual void print(const char *name) override {
+     printf("------------------------------------------------------\n");
+     printf("%s 🌱 is %s model\n", name, has_dynamic_dim() ? "Dynamic Shape" : "Static Shape");
+ 
+     int num_input = 0;
+     int num_output = 0;
+     auto engine = this->context_->engine_;
+     for (int i = 0; i < this->num_bindings(); ++i) {
+       if (this->is_input(i))
+         num_input++;
+       else
+         num_output++;
+     }
+ 
+     printf("Inputs: %d\n", num_input);
+     for (int i = 0; i < num_input; ++i) {
+       auto name = engine->getIOTensorName(i);
+       auto dim = engine->getTensorShape(name);
+       auto dtype = engine->getTensorDataType(name);
+       printf("\t%d.%s : {%s} [%s]\n", i, name, format_shape(dim).c_str(), data_type_string(dtype));
+     }
+ 
+     printf("Outputs: %d\n", num_output);
+     for (int i = 0; i < num_output; ++i) {
+       auto name = engine->getIOTensorName(i + num_input);
+       auto dim = engine->getTensorShape(name);
+       auto dtype = engine->getTensorDataType(name);
+       printf("\t%d.%s : {%s} [%s]\n", i, name, format_shape(dim).c_str(), data_type_string(dtype));
+     }
+     printf("------------------------------------------------------\n");
+   }
+ };
+ 
+ std::shared_ptr<Engine> load(const std::string &file) {
+   std::shared_ptr<EngineImplement> impl(new EngineImplement());
+   if (!impl->load(file)) impl.reset();
+   return impl;
+ }
+ 
+ }  // namespace TensorRT10

+ 61 - 0
src/common/tensorrt.hpp

@@ -0,0 +1,61 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+ #ifndef __TENSORRT_HPP__
+ #define __TENSORRT_HPP__
+ 
+ #include <memory>
+ #include <string>
+ #include <vector>
+ #include <unordered_map>
+ 
+ namespace TensorRT10 {
+ 
+ enum class DType : int { FLOAT = 0, HALF = 1, INT8 = 2, INT32 = 3, BOOL = 4, UINT8 = 5, FP8 = 6, BF16 = 7, INT64 = 8, INT4 = 9, NONE=-1 };
+ 
+ class Engine {
+  public:
+   virtual ~Engine() = default;
+   virtual bool forward(const std::unordered_map<std::string, const void *> &bindings, void *stream = nullptr, void *input_consum_event = nullptr) = 0;
+   virtual int index(const std::string &name) = 0;
+   virtual std::vector<int> run_dims(const std::string &name) = 0;
+   virtual std::vector<int> run_dims(int ibinding) = 0;
+   virtual std::vector<int> static_dims(const std::string &name) = 0;
+   virtual std::vector<int> static_dims(int ibinding) = 0;
+   virtual int numel(const std::string &name) = 0;
+   virtual int numel(int ibinding) = 0;
+   virtual int num_bindings() = 0;
+   virtual bool is_input(int ibinding) = 0;
+   virtual bool is_input(const std::string &name) = 0;
+   virtual bool set_run_dims(const std::string &name, const std::vector<int> &dims) = 0;
+   virtual bool set_run_dims(int ibinding, const std::vector<int> &dims) = 0;
+   virtual DType dtype(const std::string &name) = 0;
+   virtual DType dtype(int ibinding) = 0;
+   virtual bool has_dynamic_dim() = 0;
+   virtual void print(const char *name = "TensorRT-Engine") = 0;
+ };
+ 
+ std::shared_ptr<Engine> load(const std::string &file);
+ }  // namespace TensorRT10
+ 
+ #endif  // __TENSORRT_HPP__
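
A hedged usage sketch for this interface (the engine path and the tensor names "images"/"output0" are assumptions, typical of a YOLO export):

    #include "common/tensorrt.hpp"
    #include "common/memory.hpp"

    auto engine = TensorRT10::load("model.engine");   // nullptr on failure
    if (engine) {
        engine->print("yolo");
        if (engine->has_dynamic_dim())
            engine->set_run_dims("images", {1, 3, 640, 640});
        tensor::Memory<float> input, output;
        input.gpu(engine->numel("images"));
        output.gpu(engine->numel("output0"));
        std::unordered_map<std::string, const void *> bindings{
            {"images", input.gpu()}, {"output0", output.gpu()}};
        engine->forward(bindings, /*stream=*/nullptr);
    }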

+ 301 - 0
src/common/tensorrt8.cpp

@@ -0,0 +1,301 @@
+#include "common/tensorrt8.hpp"
+#include "common/check.hpp"
+#include <iostream>
+#include <cstring>
+#include <NvInfer.h>
+#include <cuda_runtime.h>
+#include <stdarg.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include <unordered_map>
+
+
+namespace TensorRT8
+{
+
+using namespace std;
+using namespace nvinfer1;
+
+static class Logger : public nvinfer1::ILogger {
+ public:
+  void log(Severity severity, const char *msg) noexcept override {
+    if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR) {
+      std::cerr << "[NVINFER LOG]: " << msg << std::endl;
+    }
+  }
+} gLogger_;
+
+template <typename _T>
+static void destroy_nvidia_pointer(_T *ptr)
+{
+    if (ptr) ptr->destroy();
+}
+
+static std::string format_shape(const Dims &shape) 
+{
+    stringstream output;
+    char buf[64];
+    const char *fmts[] = {"%d", "x%d"};
+    for (int i = 0; i < shape.nbDims; ++i)
+    {
+        snprintf(buf, sizeof(buf), fmts[i != 0], shape.d[i]);
+        output << buf;
+    }
+    return output.str();
+}
+
+static std::vector<uint8_t> load_file(const string &file) 
+{
+    ifstream in(file, ios::in | ios::binary);
+    if (!in.is_open()) return {};
+
+    in.seekg(0, ios::end);
+    size_t length = in.tellg();
+
+    std::vector<uint8_t> data;
+    if (length > 0) 
+    {
+        in.seekg(0, ios::beg);
+        data.resize(length);
+
+        in.read((char *)&data[0], length);
+    }
+    in.close();
+    return data;
+}
+
+class __native_engine_context 
+{
+public:
+    virtual ~__native_engine_context() { destroy(); }
+
+    bool construct(const void *pdata, size_t size) 
+    {
+        destroy();
+
+        if (pdata == nullptr || size == 0) return false;
+
+        runtime_ = shared_ptr<IRuntime>(createInferRuntime(gLogger_), destroy_nvidia_pointer<IRuntime>);
+        if (runtime_ == nullptr) return false;
+
+        engine_ = shared_ptr<ICudaEngine>(runtime_->deserializeCudaEngine(pdata, size, nullptr),
+                                        destroy_nvidia_pointer<ICudaEngine>);
+        if (engine_ == nullptr) return false;
+
+        context_ = shared_ptr<IExecutionContext>(engine_->createExecutionContext(),
+                                                destroy_nvidia_pointer<IExecutionContext>);
+        return context_ != nullptr;
+    }
+
+private:
+    void destroy() 
+    {
+        context_.reset();
+        engine_.reset();
+        runtime_.reset();
+    }
+
+public:
+    shared_ptr<IExecutionContext> context_;
+    shared_ptr<ICudaEngine> engine_;
+    shared_ptr<IRuntime> runtime_ = nullptr;
+};
+
+class EngineImplement : public Engine 
+{
+public:
+    shared_ptr<__native_engine_context> context_;
+    unordered_map<string, int> binding_name_to_index_;
+
+    virtual ~EngineImplement() = default;
+
+    bool construct(const void *data, size_t size) 
+    {
+        context_ = make_shared<__native_engine_context>();
+        if (!context_->construct(data, size)) 
+        {
+            return false;
+        }
+
+        setup();
+        return true;
+    }
+
+    bool load(const string &file) 
+    {
+        auto data = load_file(file);
+        if (data.empty()) 
+        {
+            printf("An empty file has been loaded. Please confirm your file path: %s\n", file.c_str());
+            return false;
+        }
+        return this->construct(data.data(), data.size());
+    }
+
+    void setup() 
+    {
+        auto engine = this->context_->engine_;
+        int nbBindings = engine->getNbBindings();
+
+        binding_name_to_index_.clear();
+        for (int i = 0; i < nbBindings; ++i) 
+        {
+            const char *bindingName = engine->getBindingName(i);
+            binding_name_to_index_[bindingName] = i;
+        }
+    }
+
+    virtual int index(const std::string &name) override 
+    {
+        auto iter = binding_name_to_index_.find(name);
+        Assertf(iter != binding_name_to_index_.end(), "Cannot find the binding name: %s",
+                name.c_str());
+        return iter->second;
+    }
+
+    virtual bool forward(const std::vector<void *> &bindings, void *stream,
+                        void *input_consum_event) override 
+    {
+        return this->context_->context_->enqueueV2((void**)bindings.data(), (cudaStream_t)stream,
+                                                    (cudaEvent_t *)input_consum_event);
+    }
+
+    virtual std::vector<int> run_dims(const std::string &name) override 
+    {
+        return run_dims(index(name));
+    }
+
+    virtual std::vector<int> run_dims(int ibinding) override
+    {
+        auto dim = this->context_->context_->getBindingDimensions(ibinding);
+        return std::vector<int>(dim.d, dim.d + dim.nbDims);
+    }
+
+    virtual std::vector<int> static_dims(const std::string &name) override 
+    {
+        return static_dims(index(name));
+    }
+
+    virtual std::vector<int> static_dims(int ibinding) override 
+    {
+        auto dim = this->context_->engine_->getBindingDimensions(ibinding);
+        return std::vector<int>(dim.d, dim.d + dim.nbDims);
+    }
+
+    virtual int num_bindings() override { return this->context_->engine_->getNbBindings(); }
+
+    virtual bool is_input(int ibinding) override 
+    {
+        return this->context_->engine_->bindingIsInput(ibinding);
+    }
+
+    virtual bool set_run_dims(const std::string &name, const std::vector<int> &dims) override 
+    {
+        return this->set_run_dims(index(name), dims);
+    }
+
+    virtual bool set_run_dims(int ibinding, const std::vector<int> &dims) override 
+    {
+        Dims d;
+        memcpy(d.d, dims.data(), sizeof(int) * dims.size());
+        d.nbDims = dims.size();
+        return this->context_->context_->setBindingDimensions(ibinding, d);
+    }
+
+    virtual int numel(const std::string &name) override { return numel(index(name)); }
+
+    virtual int numel(int ibinding) override 
+    {
+        auto dim = this->context_->context_->getBindingDimensions(ibinding);
+        return std::accumulate(dim.d, dim.d + dim.nbDims, 1, std::multiplies<int>());
+    }
+
+    virtual DType dtype(const std::string &name) override { return dtype(index(name)); }
+
+    virtual DType dtype(int ibinding) override 
+    {
+        return (DType)this->context_->engine_->getBindingDataType(ibinding);
+    }
+
+    virtual bool has_dynamic_dim() override 
+    {
+        // check if any input or output bindings have dynamic shapes
+        // code from ChatGPT
+        int numBindings = this->context_->engine_->getNbBindings();
+        for (int i = 0; i < numBindings; ++i) 
+        {
+            nvinfer1::Dims dims = this->context_->engine_->getBindingDimensions(i);
+            for (int j = 0; j < dims.nbDims; ++j) 
+            {
+                if (dims.d[j] == -1) return true;
+            }
+        }
+        return false;
+    }
+
+    virtual void print() override 
+    {
+        printf("------------------------------------------------------\n");
+        printf("Engine %p [%s]\n", this, has_dynamic_dim() ? "DynamicShape" : "StaticShape");
+
+        int num_input = 0;
+        int num_output = 0;
+        auto engine = this->context_->engine_;
+        for (int i = 0; i < engine->getNbBindings(); ++i) 
+        {
+            if (engine->bindingIsInput(i))
+                num_input++;
+            else
+                num_output++;
+        }
+
+        printf("Inputs: %d\n", num_input);
+        for (int i = 0; i < num_input; ++i) 
+        {
+            auto name = engine->getBindingName(i);
+            auto dim = engine->getBindingDimensions(i);
+            printf("\t%d.%s : shape {%s}\n", i, name, format_shape(dim).c_str());
+        }
+
+        printf("Outputs: %d\n", num_output);
+        for (int i = 0; i < num_output; ++i) 
+        {
+            auto name = engine->getBindingName(i + num_input);
+            auto dim = engine->getBindingDimensions(i + num_input);
+            printf("\t%d.%s : shape {%s}\n", i, name, format_shape(dim).c_str());
+        }
+    }
+
+};
+
+Engine *loadraw(const std::string &file) 
+{
+    EngineImplement *impl = new EngineImplement();
+    if (!impl->load(file)) 
+    {
+        delete impl;
+        impl = nullptr;
+    }
+    return impl;
+}
+
+std::shared_ptr<Engine> load(const std::string &file) 
+{
+    return std::shared_ptr<EngineImplement>((EngineImplement *)loadraw(file));
+}
+
+std::string format_shape(const std::vector<int> &shape) 
+{
+    stringstream output;
+    char buf[64];
+    const char *fmts[] = {"%d", "x%d"};
+    for (int i = 0; i < (int)shape.size(); ++i) 
+    {
+        snprintf(buf, sizeof(buf), fmts[i != 0], shape[i]);
+        output << buf;
+    }
+    return output.str();
+}
+
+}

+ 38 - 0
src/common/tensorrt8.hpp

@@ -0,0 +1,38 @@
+#ifndef TENSORRT8_HPP__
+#define TENSORRT8_HPP__
+#include <vector>
+#include "common/memory.hpp"
+
+namespace TensorRT8
+{
+
+enum class DType : int { FLOAT = 0, HALF = 1, INT8 = 2, INT32 = 3, BOOL = 4, UINT8 = 5 };
+
+class Engine 
+{
+public:
+    virtual bool forward(const std::vector<void *> &bindings, void *stream = nullptr,
+                        void *input_consum_event = nullptr) = 0;
+    virtual int index(const std::string &name) = 0;
+    virtual std::vector<int> run_dims(const std::string &name) = 0;
+    virtual std::vector<int> run_dims(int ibinding) = 0;
+    virtual std::vector<int> static_dims(const std::string &name) = 0;
+    virtual std::vector<int> static_dims(int ibinding) = 0;
+    virtual int numel(const std::string &name) = 0;
+    virtual int numel(int ibinding) = 0;
+    virtual int num_bindings() = 0;
+    virtual bool is_input(int ibinding) = 0;
+    virtual bool set_run_dims(const std::string &name, const std::vector<int> &dims) = 0;
+    virtual bool set_run_dims(int ibinding, const std::vector<int> &dims) = 0;
+    virtual DType dtype(const std::string &name) = 0;
+    virtual DType dtype(int ibinding) = 0;
+    virtual bool has_dynamic_dim() = 0;
+    virtual void print() = 0;
+};
+
+std::shared_ptr<Engine> load(const std::string &file);
+std::string format_shape(const std::vector<int> &shape);
+
+}  // namespace TensorRT8
+
+#endif
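
The TensorRT8 variant binds by index rather than by name; a sketch under the same assumptions (engine path and binding order are placeholders):

    #include "common/tensorrt8.hpp"

    auto engine = TensorRT8::load("model_trt8.engine");
    if (engine) {
        engine->print();
        tensor::Memory<float> input, output;
        input.gpu(engine->numel(0));    // binding 0 assumed to be the input
        output.gpu(engine->numel(1));   // binding 1 assumed to be the output
        std::vector<void *> bindings = {input.gpu(), output.gpu()};
        engine->forward(bindings, /*stream=*/nullptr);
    }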

+ 1 - 1
src/common/utils.cpp

@@ -12,4 +12,4 @@ std::string getTimeString()
     std::ostringstream oss;
     oss << std::put_time(&tm, "%Y_%m_%d_%H_%M_%S");
     return oss.str();
-}
+}

+ 12 - 0
src/common/utils.hpp

@@ -5,5 +5,17 @@
 
 std::string getTimeString();
 
+template<typename ... Args>
+static std::string str_format(const std::string &format, Args ... args)
+{
+	auto size_buf = std::snprintf(nullptr, 0, format.c_str(), args ...) + 1; 
+	std::unique_ptr<char[]> buf(new(std::nothrow) char[size_buf]);
+
+	if (!buf)
+		return std::string("");
+
+	std::snprintf(buf.get(), size_buf, format.c_str(), args ...);
+	return std::string(buf.get(), buf.get() + size_buf - 1); 
+}
 
 #endif
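
A usage sketch for str_format (it relies on std::snprintf and std::unique_ptr, so <cstdio> and <memory> must be reachable through this header's includes; the format string is illustrative):

    std::string name = str_format("result_%s_%03d.jpg", getTimeString().c_str(), 7);
    // e.g. "result_2025_01_01_12_00_00_007.jpg"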

+ 0 - 154
src/infer/cpm.hpp

@@ -1,154 +0,0 @@
-#ifndef __CPM_HPP__
-#define __CPM_HPP__
-
-// Consumer Producer Model
-
-#include <algorithm>
-#include <condition_variable>
-#include <future>
-#include <memory>
-#include <queue>
-#include <thread>
-
-namespace cpm {
-
-template <typename Result, typename Input, typename Model>
-class Instance {
- protected:
-  struct Item {
-    Input input;
-    std::shared_ptr<std::promise<Result>> pro;
-  };
-
-  std::condition_variable cond_;
-  std::queue<Item> input_queue_;
-  std::mutex queue_lock_;
-  std::shared_ptr<std::thread> worker_;
-  volatile bool run_ = false;
-  volatile int max_items_processed_ = 0;
-  void *stream_ = nullptr;
-
- public:
-  virtual ~Instance() { stop(); }
-
-  void stop() {
-    run_ = false;
-    cond_.notify_one();
-    {
-      std::unique_lock<std::mutex> l(queue_lock_);
-      while (!input_queue_.empty()) {
-        auto &item = input_queue_.front();
-        if (item.pro) item.pro->set_value(Result());
-        input_queue_.pop();
-      }
-    };
-
-    if (worker_) {
-      worker_->join();
-      worker_.reset();
-    }
-  }
-
-  virtual std::shared_future<Result> commit(const Input &input) {
-    Item item;
-    item.input = input;
-    item.pro.reset(new std::promise<Result>());
-    {
-      std::unique_lock<std::mutex> __lock_(queue_lock_);
-      input_queue_.push(item);
-    }
-    cond_.notify_one();
-    return item.pro->get_future();
-  }
-
-  virtual std::vector<std::shared_future<Result>> commits(const std::vector<Input> &inputs) {
-    std::vector<std::shared_future<Result>> output;
-    {
-      std::unique_lock<std::mutex> __lock_(queue_lock_);
-      for (int i = 0; i < (int)inputs.size(); ++i) {
-        Item item;
-        item.input = inputs[i];
-        item.pro.reset(new std::promise<Result>());
-        output.emplace_back(item.pro->get_future());
-        input_queue_.push(item);
-      }
-    }
-    cond_.notify_one();
-    return output;
-  }
-
-  template <typename LoadMethod>
-  bool start(const LoadMethod &loadmethod, int max_items_processed = 1, void *stream = nullptr) {
-    stop();
-
-    this->stream_ = stream;
-    this->max_items_processed_ = max_items_processed;
-    std::promise<bool> status;
-    worker_ = std::make_shared<std::thread>(&Instance::worker<LoadMethod>, this,
-                                            std::ref(loadmethod), std::ref(status));
-    return status.get_future().get();
-  }
-
- private:
-  template <typename LoadMethod>
-  void worker(const LoadMethod &loadmethod, std::promise<bool> &status) {
-    std::shared_ptr<Model> model = loadmethod();
-    if (model == nullptr) {
-      status.set_value(false);
-      return;
-    }
-
-    run_ = true;
-    status.set_value(true);
-
-    std::vector<Item> fetch_items;
-    std::vector<Input> inputs;
-    while (get_items_and_wait(fetch_items, max_items_processed_)) {
-      inputs.resize(fetch_items.size());
-      std::transform(fetch_items.begin(), fetch_items.end(), inputs.begin(),
-                     [](Item &item) { return item.input; });
-
-      auto ret = model->forwards(inputs, stream_);
-      for (int i = 0; i < (int)fetch_items.size(); ++i) {
-        if (i < (int)ret.size()) {
-          fetch_items[i].pro->set_value(ret[i]);
-        } else {
-          fetch_items[i].pro->set_value(Result());
-        }
-      }
-      inputs.clear();
-      fetch_items.clear();
-    }
-    model.reset();
-    run_ = false;
-  }
-
-  virtual bool get_items_and_wait(std::vector<Item> &fetch_items, int max_size) {
-    std::unique_lock<std::mutex> l(queue_lock_);
-    cond_.wait(l, [&]() { return !run_ || !input_queue_.empty(); });
-
-    if (!run_) return false;
-
-    fetch_items.clear();
-    for (int i = 0; i < max_size && !input_queue_.empty(); ++i) {
-      fetch_items.emplace_back(std::move(input_queue_.front()));
-      input_queue_.pop();
-    }
-    return true;
-  }
-
-  virtual bool get_item_and_wait(Item &fetch_item) {
-    std::unique_lock<std::mutex> l(queue_lock_);
-    cond_.wait(l, [&]() { return !run_ || !input_queue_.empty(); });
-
-    if (!run_) return false;
-
-    fetch_item = std::move(input_queue_.front());
-    input_queue_.pop();
-    return true;
-  }
-};
-
-}  // namespace cpm
-
-#endif  // __CPM_HPP__

+ 23 - 0
src/infer/infer.cpp

@@ -0,0 +1,23 @@
+#include "infer/infer.hpp"
+
+#include "infer/trt/yolo.hpp"
+#include <iostream>
+#include <memory>
+
+
+std::shared_ptr<Infer> load(const std::string& model_path, ModelType model_type, const std::vector<std::string>& names, int gpu_id, float confidence_threshold, float nms_threshold)
+{
+    std::shared_ptr<Infer> infer;
+    switch (model_type)
+    {
+    case ModelType::YOLOV5:
+    case ModelType::YOLOV8:
+    case ModelType::YOLO11:
+    case ModelType::YOLO11POSE:
+        infer = yolo::load_yolo(model_path, model_type, gpu_id, confidence_threshold, nms_threshold);
+        break;
+    default:
+        break;
+    }
+    return infer;
+}

+ 2 - 1
src/infer/infer.hpp

@@ -15,7 +15,8 @@ enum class ModelType : int{
 
 class Infer{
 public:
-    virtual data::BoxArray forward() = 0;
+    virtual data::BoxArray forward(const tensor::Image &image, int slice_width, int slice_height, float overlap_width_ratio, float overlap_height_ratio, void *stream = nullptr) = 0;
+    virtual data::BoxArray forward(const tensor::Image &image, void *stream = nullptr) = 0;
 };
 
 std::shared_ptr<Infer> load(const std::string& model_path, ModelType model_type, const std::vector<std::string>& names={}, int gpu_id=0, float confidence_threshold=0.5f, float nms_threshold=0.45f);
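
A sketch of how the two forward overloads are meant to be called (file names and thresholds are placeholders; the second call is the SAHI-style sliced path):

    auto detector = load("yolo11.engine", ModelType::YOLO11, {}, 0, 0.5f, 0.45f);
    cv::Mat frame = cv::imread("street.jpg");
    auto boxes  = detector->forward(tensor::cvimg(frame));                          // whole image
    auto sliced = detector->forward(tensor::cvimg(frame), 640, 640, 0.2f, 0.2f);    // 640x640 tiles, 20% overlap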

+ 0 - 184
src/infer/opencv/yolov5.cpp

@@ -1,184 +0,0 @@
-#ifndef YOLO_HPP__
-#define YOLO_HPP__
-
-#include "common/data.hpp"
-#include "infer/infer.hpp"
-
-#include "opencv2/opencv.hpp"
-#include "opencv2/dnn.hpp"
-#include <memory>
-#include <string>
-#include <vector>
-#include <iostream>
-
-
-namespace yolo
-{
-
-class Yolov5InferImpl : public Infer
-{
-public:
-    ModelType model_type;
-    std::shared_ptr<cv::dnn::Net> net_;
-
-    float confidence_threshold_;
-    float nms_threshold_;
-    int network_input_width_, network_input_height_;
-
-
-
-    std::vector<std::string> names_;
-
-    bool load(const std::string& model_path, const std::vector<std::string>& names, float confidence_threshold=0.5f, float nms_threshold=0.45f)
-    {
-        net_ = std::make_shared<cv::dnn::Net>(cv::dnn::readNet(model_path));
-        // get the model's input layer names
-        std::vector<std::string> inputNames = net->getLayerNames();
-        
-        // query the input layer shape information
-        std::vector<std::vector<int>> inShapes, outShapes;
-        net.getLayerShapes(cv::dnn::Dict(), 0, inShapes, outShapes);
-
-        if (!inShapes.empty()) {
-            int batchSize = inShapes[0][0]; // batch size (usually 1)
-            int channels = inShapes[0][1];  // channel count
-            network_input_height_ = inShapes[0][2];    // height
-            network_input_width_ = inShapes[0][3];     // width
-
-            std::cout << "Model Input Shape: " << batchSize << "x" 
-                    << channels << "x" << network_input_height_ << "x" << network_input_width_ << std::endl;
-        } else {
-            std::cout << "Failed to get input shape!" << std::endl;
-        }
-        return true;
-    }
-
-    void warpAffine(cv::Mat& src_image, cv::Mat& dst_image, float *d2i)
-    {
-        int src_image_width  = src_image.cols;
-        int src_image_height = src_image.rows;
-
-        float scale_x = network_input_width_  / (float)src_image_width;
-        float scale_y = network_input_height_ / (float)src_image_height;
-        float scale   = std::min(scale_x, scale_y);
-        float i2d[6];
-        i2d[0] = scale;
-        i2d[1] = 0;
-        i2d[2] = (-scale * src_image_width + network_input_width_ + scale - 1) * 0.5;
-        i2d[3] = 0;
-        i2d[4] = scale;
-        i2d[5] = (-scale * src_image_height + network_input_height_ + scale - 1) * 0.5;
-
-        cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
-        cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
-        cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);
-
-        dst_image.create(network_input_height_, network_input_width_, CV_8UC3);
-        cv::warpAffine(src_image, dst_image, m2x3_i2d, dst_image.size(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));
-    }
-
-    void decode(std::vector<cv::Mat>& outs,
-        data::BoxArray& result_boxes,
-        float *d2i,
-        int src_image_width,
-        int src_image_height)
-    {
-        data::BoxArray boxes;
-        int cols = outs[0].size[2];
-        int rows = outs[0].size[1];
-        float* predict = (float*)outs[0].data;
-        int num_classes = cols - 5;
-    
-        for(int i = 0; i < rows; ++i)
-        {
-            float* pitem = predict + i * cols;
-            float objness = pitem[4];
-            if (objness < confidence_threshold_)
-            {
-                continue;
-            }
-    
-            float* pclass = pitem + 5;
-            int label     = std::max_element(pclass, pclass + num_classes) - pclass;
-            float prob    = pclass[label];
-            float confidence = prob * objness;
-            if(confidence < confidence_threshold_)
-            {
-                continue;
-            }
-            float cx     = pitem[0];
-            float cy     = pitem[1];
-            float width  = pitem[2];
-            float height = pitem[3];
-    
-            // map back to the original image scale via the inverse affine transform
-            float left   = (cx - width * 0.5) * d2i[0] + d2i[2];
-            float top    = (cy - height * 0.5) * d2i[0] + d2i[5];
-            float right  = (cx + width * 0.5) * d2i[0] + d2i[2];
-            float bottom = (cy + height * 0.5) * d2i[0] + d2i[5];
-            boxes.emplace_back(left, top, right, bottom, confidence, names_[label]);
-        }
-        std::sort(boxes.begin(), boxes.end(), [](Box& a, Box& b){return a.confidence > b.confidence;});
-        std::vector<bool> remove_flags(boxes.size());
-        result_boxes.reserve(boxes.size());
-    
-        auto iou = [](const Box& a, const Box& b){
-            int cross_left   = std::max(a.left, b.left);
-            int cross_top    = std::max(a.top, b.top);
-            int cross_right  = std::min(a.right, b.right);
-            int cross_bottom = std::min(a.bottom, b.bottom);
-    
-            int cross_area = std::max(0, cross_right - cross_left) * std::max(0, cross_bottom - cross_top);
-            int union_area = std::max(0.f, a.right - a.left) * std::max(0.f, a.bottom - a.top)
-                            + std::max(0.f, b.right - b.left) * std::max(0.f, b.bottom - b.top) - cross_area;
-            if(cross_area == 0 || union_area == 0) return 0.0f;
-            return 1.0f * cross_area / union_area;
-        };
-    
-        for(int i = 0; i < boxes.size(); ++i)
-        {
-            if(remove_flags[i]) continue;
-    
-            auto& ibox = boxes[i];
-            result_boxes.emplace_back(ibox);
-            for (int j = i + 1; j < boxes.size(); ++j)
-            {
-                if (remove_flags[j]) continue;
-    
-                auto& jbox = boxes[j];
-                if (ibox.class_id == jbox.class_id)
-                {
-                    // class matched
-                    if (iou(ibox, jbox) >= nms_threshold_)
-                        remove_flags[j] = true;
-                }
-            }
-        }
-    
-    }
-
-    virtual data::BoxArray forward(cv::Mat& image) override
-    {
-        float d2i[6];
-        cv::Mat affine_image;
-        warpAffine(image, affine_image, d2i);
-        std::vector<cv::Mat> outs;
-        auto blob = cv::dnn::blobFromImage(affine_image, 1 / 255.0, cv::Size(network_input_height_, network_input_width_), cv::Scalar(0, 0, 0), true, false);
-        net_->setInput(blob);
-        net_->forward(outs, net_.getUnconnectedOutLayersNames());
-        data::BoxArray result;
-        decode(outs, result, d2i, image.cols, image.rows);
-        return result;
-    }
-};
-
-std::shared_ptr<Infer> load(const std::string &engine_file, ModelType model_type, const std::vector<std::string>& names, int gpu_id, float confidence_threshold, float nms_threshold)
-{
-    // checkRuntime(cudaSetDevice(gpu_id));
-    return std::shared_ptr<Yolov5InferImpl>((Yolov5InferImpl *)(new Yolov5InferImpl(engine_file, names, confidence_threshold, nms_threshold)));
-}
-
-
-}
-
-#endif // YOLO_HPP__

+ 297 - 0
src/infer/slice/slice.cu

@@ -0,0 +1,297 @@
+#include "infer/slice/slice.hpp"
+#include "common/check.hpp"
+#include <cmath>
+
+static __global__ void slice_kernel(
+  const uchar3* __restrict__ image,
+  uchar3* __restrict__ outs,
+  const int width,
+  const int height,
+  const int slice_width,
+  const int slice_height,
+  const int slice_num_h,
+  const int slice_num_v,
+  const int* __restrict__ slice_start_point)
+{
+    const int slice_idx = blockIdx.z;
+
+    const int start_x = slice_start_point[slice_idx * 2];
+    const int start_y = slice_start_point[slice_idx * 2 + 1];
+
+    // pixel position relative to the current slice
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(x >= slice_width || y >= slice_height) 
+        return;
+
+    const int dx = start_x + x;
+    const int dy = start_y + y;
+
+    if(dx >= width || dy >= height) 
+        return;
+
+    // read the source pixel
+    const int src_index = dy * width + dx;
+    const uchar3 pixel = image[src_index];
+
+    // write it into the slice buffer
+    const int dst_index = slice_idx * slice_width * slice_height + y * slice_width + x;
+    outs[dst_index] = pixel;
+}
+
+static void slice_plane(const uint8_t* image,
+    uint8_t* outs,
+    int* slice_start_point,
+    const int width,
+    const int height,
+    const int slice_width,
+    const int slice_height,
+    const int slice_num_h,
+    const int slice_num_v,
+    void* stream=nullptr)
+{
+    int slice_total = slice_num_h * slice_num_v;
+    cudaStream_t stream_ = (cudaStream_t)stream;
+    dim3 block(32, 32);
+    dim3 grid(
+        (slice_width + block.x - 1) / block.x,
+        (slice_height + block.y - 1) / block.y,
+        slice_total
+    );
+    slice_kernel<<<grid, block, 0, stream_>>>(
+        reinterpret_cast<const uchar3*>(image),
+        reinterpret_cast<uchar3*>(outs),
+        width, height, 
+        slice_width, slice_height, 
+        slice_num_h, slice_num_v, 
+        slice_start_point
+    );
+}
+
+
+namespace slice
+{
+
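+// Worked example (hypothetical numbers): dimension=1920, subDimension=640,
+// overlapRatio=0.2 -> step=512, cuts=(1920-640)/512=2.5; |2.5-round(2.5)|=0.5>1e-4,
+// so numCuts=ceil(2.5)=3 and calculateNumCuts returns 3+1=4 slices along that axis.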
+int calculateNumCuts(int dimension, int subDimension, float overlapRatio) {
+    float step = subDimension * (1 - overlapRatio);
+    if(step == 0)
+    {
+        return 1;
+    }
+    float cuts = static_cast<float>(dimension - subDimension) / step;
+    // floating-point error can make a direct ceil() produce one extra slice, so snap near-integers first
+    if (fabs(cuts - round(cuts)) < 0.0001) {
+        cuts = round(cuts);
+    }
+    int numCuts = static_cast<int>(std::ceil(cuts));
+    return numCuts + 1;
+}
+
+static int calc_resolution_factor(int resolution)
+{
+    int expo = 0;
+    while(pow(2, expo) < resolution) expo++;
+    return expo - 1;
+} 
+
+static std::string calc_aspect_ratio_orientation(int width, int height)
+{
+    if (width < height)
+        return  "vertical";
+    else if(width > height)
+        return "horizontal";
+    else
+        return "square";
+}
+
+static std::tuple<int, int, float, float> calc_ratio_and_slice(const std::string& orientation, int slide=1, float ratio=0.1)
+{
+    int slice_row, slice_col;
+    float overlap_height_ratio, overlap_width_ratio;
+    if (orientation == "vertical")
+    {
+        slice_row = slide;
+        slice_col = slide * 2;
+        overlap_height_ratio = ratio;
+        overlap_width_ratio = ratio;
+    }
+    else if (orientation == "horizontal")
+    {
+        slice_row = slide * 2;
+        slice_col = slide;
+        overlap_height_ratio = ratio;
+        overlap_width_ratio = ratio;
+    }
+    else if (orientation == "square")
+    {
+        slice_row = slide;
+        slice_col = slide;
+        overlap_height_ratio = ratio;
+        overlap_width_ratio = ratio;
+    }
+    return std::make_tuple(slice_row, slice_col, overlap_height_ratio, overlap_width_ratio);
+}
+
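+// e.g. resolution = "high" on a 3840x2160 image: the orientation is
+// "horizontal", so calc_ratio_and_slice(orientation, 2, 0.4) gives
+// split_row = 4, split_col = 2, i.e. 960x1080 slices with 0.4 overlap.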
+static std::tuple<int, int, float, float> calc_slice_and_overlap_params(
+    const std::string& resolution, int width, int height, std::string orientation)
+{
+    int split_row, split_col;
+    float overlap_height_ratio, overlap_width_ratio;
+    if (resolution == "medium")
+        std::tie(split_row, split_col, overlap_height_ratio, overlap_width_ratio) = calc_ratio_and_slice(
+            orientation, 1, 0.8
+        );
+
+    else if (resolution == "high")
+        std::tie(split_row, split_col, overlap_height_ratio, overlap_width_ratio) = calc_ratio_and_slice(
+            orientation, 2, 0.4
+        );
+
+    else if (resolution == "ultra-high")
+        std::tie(split_row, split_col, overlap_height_ratio, overlap_width_ratio) = calc_ratio_and_slice(
+            orientation, 4, 0.4
+        );
+    else  // "low" resolution: keep the whole image as a single slice
+    {
+        split_col = 1;
+        split_row = 1;
+        overlap_width_ratio = 1;
+        overlap_height_ratio = 1;
+    }
+    int slice_height = height / split_col;
+    int slice_width = width / split_row;
+    return std::make_tuple(slice_width, slice_height, overlap_height_ratio, overlap_width_ratio);
+}
+
+static std::tuple<int, int, float, float> get_resolution_selector(const std::string& resolution, int width, int height)
+{
+    std::string orientation = calc_aspect_ratio_orientation(width, height);
+    return calc_slice_and_overlap_params(resolution, width, height, orientation);
+}
+
+static std::tuple<int, int, float, float> get_auto_slice_params(int width, int height)
+{
+    int resolution = height * width;
+    int factor = calc_resolution_factor(resolution);
+    if (factor <= 18)
+        return get_resolution_selector("low", width, height);
+    else if (factor < 21)
+        return get_resolution_selector("medium", width, height);
+    else if (factor < 24)
+        return get_resolution_selector("high", width, height);
+    else
+        return get_resolution_selector("ultra-high", width, height);
+}
+
+void SliceImage::autoSlice(
+        const tensor::Image& image,
+        void* stream)
+{
+    int slice_width;
+    int slice_height;
+    float overlap_width_ratio;
+    float overlap_height_ratio;
+    std::tie(slice_width, slice_height, overlap_height_ratio, overlap_width_ratio) = get_auto_slice_params(image.width, image.height);
+    slice(image, slice_width, slice_height, overlap_width_ratio, overlap_height_ratio, stream);
+}
+
+void SliceImage::slice(
+        const tensor::Image& image, 
+        const int slice_width,
+        const int slice_height,
+        const float overlap_width_ratio,
+        const float overlap_height_ratio,
+        void* stream)
+{
+    slice_width_  = slice_width;
+    slice_height_ = slice_height;
+    cudaStream_t stream_ = (cudaStream_t)stream;
+
+    int width = image.width;
+    int height = image.height;
+
+    slice_num_h_ = calculateNumCuts(width, slice_width, overlap_width_ratio);
+    slice_num_v_ = calculateNumCuts(height, slice_height, overlap_height_ratio);
+    /*
+    printf("------------------------------------------------------\n"
+           "CUDA SAHI CROP IMAGE ✂️\n"
+           "------------------------------------------------------\n"
+           "%-30s: %-10d\n"
+           "%-30s: %-10d\n"
+           "%-30s: %-10.2f\n"
+           "%-30s: %-10.2f\n"
+           "%-30s: %-10d\n"
+           "%-30s: %-10d\n"
+           "------------------------------------------------------\n",
+           "Slice width", slice_width_,
+           "Slice height", slice_height_,
+           "Overlap width ratio", overlap_width_ratio,
+           "Overlap height ratio", overlap_height_ratio,
+           "Number of horizontal cuts", slice_num_h_,
+           "Number of vertical cuts", slice_num_v_);
+    */
+    int slice_num            = slice_num_h_ * slice_num_v_;
+    int overlap_width_pixel  = slice_width  * overlap_width_ratio;
+    int overlap_height_pixel = slice_height * overlap_height_ratio;
+
+    size_t size_image = 3 * width * height;
+    size_t output_img_size = 3 * slice_width * slice_height;
+
+    input_image_.gpu(size_image);
+    output_images_.gpu(slice_num * output_img_size);
+    checkRuntime(cudaMemsetAsync(output_images_.gpu(), 114, output_images_.gpu_bytes(), stream_));
+
+    checkRuntime(cudaMemcpyAsync(input_image_.gpu(), image.bgrptr, size_image, cudaMemcpyHostToDevice, stream_));
+    // checkRuntime(cudaStreamSynchronize(stream_));
+
+    uint8_t* input_device = input_image_.gpu();
+    uint8_t* output_device = output_images_.gpu();
+
+    slice_start_point_.cpu(slice_num * 2);
+    slice_start_point_.gpu(slice_num * 2);
+
+    int* slice_start_point_ptr = slice_start_point_.cpu();
+    
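+    // e.g. width = 1920, slice_width = 640, overlap ratio 0.2: the stride is
+    // 640 - 128 = 512 and slice_num_h_ = 4, so the clamp yields x = 0, 512,
+    // 1024, 1280, and the last slice sits flush with the right edge.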
+    for (int i = 0; i < slice_num_h_; i++)
+    {
+        int x = std::min(width - slice_width, std::max(0, i * (slice_width - overlap_width_pixel)));
+        for (int j = 0; j < slice_num_v_; j++)
+        {
+            int y = std::min(height - slice_height, std::max(0, j * (slice_height - overlap_height_pixel)));
+            int index = (i * slice_num_v_ + j) * 2;
+            slice_start_point_ptr[index] = x;
+            slice_start_point_ptr[index + 1] = y;
+        }
+    }
+    
+    checkRuntime(cudaMemcpyAsync(slice_start_point_.gpu(), slice_start_point_.cpu(), slice_num*2*sizeof(int), cudaMemcpyHostToDevice, stream_));
+    checkRuntime(cudaStreamSynchronize(stream_));
+    slice_plane(
+        input_device, output_device, slice_start_point_.gpu(),
+        width, height, 
+        slice_width, slice_height, 
+        slice_num_h_, slice_num_v_,
+        stream);
+
+    // checkRuntime(cudaStreamSynchronize(stream_));
+
+    // for (int i = 0; i < slice_num_h_; i++)
+    // {
+    //     for (int j = 0; j < slice_num_v_; j++)
+    //     {
+    //         int index = i * slice_num_v_ + j;
+    //         slice_position_[index*2]   = slice_start_point_ptr[index*2];
+    //         slice_position_[index*2+1] = slice_start_point_ptr[index*2+1];
+
+    //         // cv::Mat image = cv::Mat::zeros(slice_height, slice_width, CV_8UC3);
+    //         // uint8_t* output_img_data = image.ptr<uint8_t>();
+    //         // cudaMemcpyAsync(output_img_data, output_device+index*output_img_size, output_img_size*sizeof(uint8_t), cudaMemcpyDeviceToHost, stream_);
+    //         // checkRuntime(cudaStreamSynchronize(stream_));
+    //         // cv::imwrite(std::to_string(index) + ".png", image);
+    //     }
+    // }
+}
+
+}

+ 48 - 0
src/infer/slice/slice.hpp

@@ -0,0 +1,48 @@
+#ifndef SLICE_HPP__
+#define SLICE_HPP__
+
+#include "opencv2/opencv.hpp"
+#include "common/image.hpp"
+#include "common/memory.hpp"
+#include <vector>
+
+namespace slice
+{
+
+int calculateNumCuts(int dimension, int subDimension, float overlapRatio);
+
+class SliceImage{
+public:
+    tensor::Memory<unsigned char> input_image_;
+    tensor::Memory<unsigned char> output_images_;
+
+    tensor::Memory<int> slice_start_point_;
+
+    int slice_num_h_;
+    int slice_num_v_;
+
+    int slice_width_;
+    int slice_height_;
+
+    // std::vector<int> slice_position_;
+
+public:
+    void slice(
+        const tensor::Image& image, 
+        const int slice_width,
+        const int slice_height, 
+        const float overlap_width_ratio,
+        const float overlap_height_ratio,
+        void* stream=nullptr);
+    
+    void autoSlice(
+        const tensor::Image& image, 
+        void* stream=nullptr);
+};
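+// Minimal usage sketch (the tensor::Image source and stream handling are
+// assumed to follow the rest of this repo):
+//   slice::SliceImage slicer;
+//   slicer.slice(image, 640, 640, 0.25f, 0.25f, stream);
+//   int n = slicer.slice_num_h_ * slicer.slice_num_v_;  // crops are in slicer.output_images_ (GPU)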
+
+
+
+}
+
+
+#endif

+ 138 - 0
src/infer/trt/affine.cu

@@ -0,0 +1,138 @@
+#include "infer/trt/affine.hpp"
+#include "common/check.hpp"
+
+namespace affine
+{
+
+Norm Norm::mean_std(const float mean[3], const float std[3], float alpha,
+                    ChannelType channel_type) 
+{
+    Norm out;
+    out.type = NormType::MeanStd;
+    out.alpha = alpha;
+    out.channel_type = channel_type;
+    memcpy(out.mean, mean, sizeof(out.mean));
+    memcpy(out.std, std, sizeof(out.std));
+    return out;
+}
+
+Norm Norm::alpha_beta(float alpha, float beta, ChannelType channel_type) 
+{
+    Norm out;
+    out.type = NormType::AlphaBeta;
+    out.alpha = alpha;
+    out.beta = beta;
+    out.channel_type = channel_type;
+    return out;
+}
+
+Norm Norm::None() { return Norm(); }
+
+
+static __global__ void warp_affine_bilinear_and_normalize_plane_kernel(
+    uint8_t *src, int src_line_size, int src_width, int src_height, float *dst, int dst_width,
+    int dst_height, uint8_t const_value_st, float *warp_affine_matrix_2_3, Norm norm) 
+{
+    int dx = blockDim.x * blockIdx.x + threadIdx.x;
+    int dy = blockDim.y * blockIdx.y + threadIdx.y;
+    if (dx >= dst_width || dy >= dst_height) return;
+
+    float m_x1 = warp_affine_matrix_2_3[0];
+    float m_y1 = warp_affine_matrix_2_3[1];
+    float m_z1 = warp_affine_matrix_2_3[2];
+    float m_x2 = warp_affine_matrix_2_3[3];
+    float m_y2 = warp_affine_matrix_2_3[4];
+    float m_z2 = warp_affine_matrix_2_3[5];
+
+    float src_x = m_x1 * dx + m_y1 * dy + m_z1;
+    float src_y = m_x2 * dx + m_y2 * dy + m_z2;
+    float c0, c1, c2;
+
+    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) 
+    {
+        // out of range
+        c0 = const_value_st;
+        c1 = const_value_st;
+        c2 = const_value_st;
+    } 
+    else 
+    {
+        int y_low = floorf(src_y);
+        int x_low = floorf(src_x);
+        int y_high = y_low + 1;
+        int x_high = x_low + 1;
+
+        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
+        float ly = src_y - y_low;
+        float lx = src_x - x_low;
+        float hy = 1 - ly;
+        float hx = 1 - lx;
+        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+        uint8_t *v1 = const_value;
+        uint8_t *v2 = const_value;
+        uint8_t *v3 = const_value;
+        uint8_t *v4 = const_value;
+        if (y_low >= 0) 
+        {
+            if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3;
+
+            if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3;
+        }
+
+        if (y_high < src_height) 
+        {
+            if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3;
+
+            if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3;
+        }
+
+        // same to opencv
+        c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
+        c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
+        c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
+    }
+
+    if (norm.channel_type == ChannelType::SwapRB) 
+    {
+        float t = c2;
+        c2 = c0;
+        c0 = t;
+    }
+
+    if (norm.type == NormType::MeanStd) 
+    {
+        c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
+        c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
+        c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
+    } 
+    else if (norm.type == NormType::AlphaBeta) 
+    {
+        c0 = c0 * norm.alpha + norm.beta;
+        c1 = c1 * norm.alpha + norm.beta;
+        c2 = c2 * norm.alpha + norm.beta;
+    }
+
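+    // write the result planar (CHW): the three channel planes sit area
+    // floats apart, matching the float input binding the network expects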
+    int area = dst_width * dst_height;
+    float *pdst_c0 = dst + dy * dst_width + dx;
+    float *pdst_c1 = pdst_c0 + area;
+    float *pdst_c2 = pdst_c1 + area;
+    *pdst_c0 = c0;
+    *pdst_c1 = c1;
+    *pdst_c2 = c2;
+}
+
+void warp_affine_bilinear_and_normalize_plane(uint8_t *src, int src_line_size, int src_width,
+                                                    int src_height, float *dst, int dst_width,
+                                                    int dst_height, float *matrix_2_3,
+                                                    uint8_t const_value, const Norm &norm,
+                                                    cudaStream_t stream) 
+{
+    dim3 grid((dst_width + 31) / 32, (dst_height + 31) / 32);
+    dim3 block(32, 32);
+
+    checkKernel(warp_affine_bilinear_and_normalize_plane_kernel<<<grid, block, 0, stream>>>(
+        src, src_line_size, src_width, src_height, dst, dst_width, dst_height, const_value,
+        matrix_2_3, norm));
+}
+
+} // namespace affine

+ 112 - 0
src/infer/trt/affine.hpp

@@ -0,0 +1,112 @@
+#ifndef AFFINE_HPP__
+#define AFFINE_HPP__
+
+#include <memory>
+#include <tuple>
+#include <algorithm>
+#include <cstdint>
+#include <cuda_runtime.h>
+
+namespace affine
+{
+
+enum class NormType : int { None = 0, MeanStd = 1, AlphaBeta = 2 };
+
+enum class ChannelType : int { None = 0, SwapRB = 1 };
+
+struct Norm 
+{
+    float mean[3];
+    float std[3];
+    float alpha, beta;
+    NormType type = NormType::None;
+    ChannelType channel_type = ChannelType::None;
+
+    // out = (x * alpha - mean) / std
+    static Norm mean_std(const float mean[3], const float std[3], float alpha = 1 / 255.0f,
+                        ChannelType channel_type = ChannelType::None);
+
+    // out = x * alpha + beta
+    static Norm alpha_beta(float alpha, float beta = 0, ChannelType channel_type = ChannelType::None);
+
+    // None
+    static Norm None();
+};
+
+struct ResizeMatrix 
+{
+    float i2d[6];  // image to dst(network), 2x3 matrix
+    float d2i[6];  // dst to image, 2x3 matrix
+
+    void compute(const std::tuple<int, int> &from, const std::tuple<int, int> &to) 
+    {
+        float scale_x = std::get<0>(to) / (float)std::get<0>(from);
+        float scale_y = std::get<1>(to) / (float)std::get<1>(from);
+
+        // resize 
+        i2d[0] = scale_x;
+        i2d[1] = 0;
+        i2d[2] = 0;
+        i2d[3] = 0;
+        i2d[4] = scale_y;
+        i2d[5] = 0;
+
+
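+        // invert the 2x3 affine analytically: with M = [[a, b, c], [d, e, f]]
+        // and D = ae - bd, the inverse linear part is [[e, -b], [-d, a]] / D
+        // and the inverse translation is -A_inv * [c, f]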
+        double D = i2d[0] * i2d[4] - i2d[1] * i2d[3];
+        D = D != 0. ? double(1.) / D : double(0.);
+        double A11 = i2d[4] * D, A22 = i2d[0] * D, A12 = -i2d[1] * D, A21 = -i2d[3] * D;
+        double b1 = -A11 * i2d[2] - A12 * i2d[5];
+        double b2 = -A21 * i2d[2] - A22 * i2d[5];
+
+        d2i[0] = A11;
+        d2i[1] = A12;
+        d2i[2] = b1;
+        d2i[3] = A21;
+        d2i[4] = A22;
+        d2i[5] = b2;
+    }
+};
+
+
+struct LetterBoxMatrix 
+{
+    float i2d[6];  // image to dst(network), 2x3 matrix
+    float d2i[6];  // dst to image, 2x3 matrix
+
+    void compute(const std::tuple<int, int> &from, const std::tuple<int, int> &to) 
+    {
+        float scale_x = std::get<0>(to) / (float)std::get<0>(from);
+        float scale_y = std::get<1>(to) / (float)std::get<1>(from);
+        float scale = std::min(scale_x, scale_y);
+
+        // letter box
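+        // the "scale * 0.5 - 0.5" terms work in pixel-center coordinates so
+        // the uniformly scaled image lands centered on the destination canvas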
+        i2d[0] = scale;
+        i2d[1] = 0;
+        i2d[2] = -scale * std::get<0>(from) * 0.5 + std::get<0>(to) * 0.5 + scale * 0.5 - 0.5;
+        i2d[3] = 0;
+        i2d[4] = scale;
+        i2d[5] = -scale * std::get<1>(from) * 0.5 + std::get<1>(to) * 0.5 + scale * 0.5 - 0.5;
+
+
+        double D = i2d[0] * i2d[4] - i2d[1] * i2d[3];
+        D = D != 0. ? double(1.) / D : double(0.);
+        double A11 = i2d[4] * D, A22 = i2d[0] * D, A12 = -i2d[1] * D, A21 = -i2d[3] * D;
+        double b1 = -A11 * i2d[2] - A12 * i2d[5];
+        double b2 = -A21 * i2d[2] - A22 * i2d[5];
+
+        d2i[0] = A11;
+        d2i[1] = A12;
+        d2i[2] = b1;
+        d2i[3] = A21;
+        d2i[4] = A22;
+        d2i[5] = b2;
+    }
+};
+
+void warp_affine_bilinear_and_normalize_plane(uint8_t *src, int src_line_size, int src_width,
+                                                int src_height, float *dst, int dst_width,
+                                                int dst_height, float *matrix_2_3,
+                                                uint8_t const_value, const Norm &norm,
+                                                cudaStream_t stream);
+
+}
+
+#endif // AFFINE_HPP__

+ 564 - 0
src/infer/trt/yolo.cu

@@ -0,0 +1,564 @@
+#include "model/yolo.hpp"
+#include <vector>
+#include <memory>
+#include "slice/slice.hpp"
+#include "model/affine.hpp"
+#include "common/check.hpp"
+
+#define GPU_BLOCK_THREADS 512
+
+namespace yolo
+{
+
+static const int NUM_BOX_ELEMENT = 8;  // left, top, right, bottom, confidence, class, keepflag, row_index(output)
+static const int MAX_IMAGE_BOXES = 1024 * 4;
+
+static const int KEY_POINT_NUM   = 17; // number of keypoints
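+// per-box record in the pose path: 8 base floats (left, top, right, bottom,
+// confidence, class, keepflag, row_index) followed by 17 * (x, y, score),
+// i.e. NUM_BOX_ELEMENT + KEY_POINT_NUM * 3 = 59 floats per box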
+
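+// e.g. numJobs = 8400 candidate boxes: block = 512 threads and
+// grid = (8400 + 511) / 512 = 17 blocks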
+static dim3 grid_dims(int numJobs)
+{
+    int numBlockThreads = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
+    return dim3((numJobs + numBlockThreads - 1) / numBlockThreads);
+}
+
+static dim3 block_dims(int numJobs)
+{
+    return numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
+}
+
+static __host__ __device__ void affine_project(float *matrix, float x, float y, float *ox, float *oy) 
+{
+    *ox = matrix[0] * x + matrix[1] * y + matrix[2];
+    *oy = matrix[3] * x + matrix[4] * y + matrix[5];
+}
+
+static __global__ void decode_kernel_v5(float *predict, int num_bboxes, int num_classes,
+                                              int output_cdim, float confidence_threshold,
+                                              float *invert_affine_matrix, float *parray, int *box_count,
+                                              int max_image_boxes, int start_x, int start_y) 
+{
+    int position = blockDim.x * blockIdx.x + threadIdx.x;
+    if (position >= num_bboxes) return;
+
+    float *pitem = predict + output_cdim * position;
+    float objectness = pitem[4];
+    if (objectness < confidence_threshold) return;
+
+    float *class_confidence = pitem + 5;
+    
+    float confidence = *class_confidence++;
+    int label = 0;
+    for (int i = 1; i < num_classes; ++i, ++class_confidence) 
+    {
+        if (*class_confidence > confidence) 
+        {
+            confidence = *class_confidence;
+            label = i;
+        }
+    }
+    confidence *= objectness;
+    if (confidence < confidence_threshold) return;
+    
+    int index = atomicAdd(box_count, 1);
+    if (index >= max_image_boxes) return;
+
+    float cx = *pitem++;
+    float cy = *pitem++;
+    float width = *pitem++;
+    float height = *pitem++;
+    float left = cx - width * 0.5f;
+    float top = cy - height * 0.5f;
+    float right = cx + width * 0.5f;
+    float bottom = cy + height * 0.5f;
+    affine_project(invert_affine_matrix, left, top, &left, &top);
+    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+    float *pout_item = parray + index * NUM_BOX_ELEMENT;
+    *pout_item++ = left + start_x;
+    *pout_item++ = top + start_y;
+    *pout_item++ = right + start_x;
+    *pout_item++ = bottom + start_y;
+    *pout_item++ = confidence;
+    *pout_item++ = label;
+    *pout_item++ = 1;  // 1 = keep, 0 = ignore
+    *pout_item++ = position;
+}
+
+static __global__ void decode_kernel_v8(float *predict, int num_bboxes, int num_classes,
+                                              int output_cdim, float confidence_threshold,
+                                              float *invert_affine_matrix, float *parray, int *box_count,
+                                              int max_image_boxes, int start_x, int start_y) 
+{
+    int position = blockDim.x * blockIdx.x + threadIdx.x;
+    if (position >= num_bboxes) return;
+
+    float *pitem = predict + output_cdim * position;
+    float *class_confidence = pitem + 4;
+    float confidence = *class_confidence++;
+    int label = 0;
+    for (int i = 1; i < num_classes; ++i, ++class_confidence) 
+    {
+        if (*class_confidence > confidence) 
+        {
+            confidence = *class_confidence;
+            label = i;
+        }
+    }
+    if (confidence < confidence_threshold) return;
+
+    int index = atomicAdd(box_count, 1);
+    if (index >= max_image_boxes) return;
+
+    float cx = *pitem++;
+    float cy = *pitem++;
+    float width = *pitem++;
+    float height = *pitem++;
+    float left = cx - width * 0.5f;
+    float top = cy - height * 0.5f;
+    float right = cx + width * 0.5f;
+    float bottom = cy + height * 0.5f;
+    affine_project(invert_affine_matrix, left, top, &left, &top);
+    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+    float *pout_item = parray + index * NUM_BOX_ELEMENT;
+    *pout_item++ = left + start_x;
+    *pout_item++ = top + start_y;
+    *pout_item++ = right + start_x;
+    *pout_item++ = bottom + start_y;
+    *pout_item++ = confidence;
+    *pout_item++ = label;
+    *pout_item++ = 1;  // 1 = keep, 0 = ignore
+    *pout_item++ = position;
+}
+
+static __global__ void decode_kernel_11pose(float *predict, int num_bboxes, int num_classes,
+    int output_cdim, float confidence_threshold,
+    float *invert_affine_matrix, float *parray,
+    int *box_count, int max_image_boxes, int start_x, int start_y) 
+{
+    int position = blockDim.x * blockIdx.x + threadIdx.x;
+    if (position >= num_bboxes) return;
+
+    float *pitem            = predict + output_cdim * position;
+    float *class_confidence = pitem + 4;
+    float *key_points       = pitem + 4 + num_classes;
+    float confidence        = *class_confidence++;
+
+    int label = 0;
+    for (int i = 1; i < num_classes; ++i, ++class_confidence) 
+    {
+        if (*class_confidence > confidence) 
+        {
+            confidence = *class_confidence;
+            label = i;
+        }
+    }
+    if (confidence < confidence_threshold) return;
+
+    int index = atomicAdd(box_count, 1);
+    if (index >= max_image_boxes) return;
+
+    float cx = *pitem++;
+    float cy = *pitem++;
+    float width = *pitem++;
+    float height = *pitem++;
+    float left = cx - width * 0.5f;
+    float top = cy - height * 0.5f;
+    float right = cx + width * 0.5f;
+    float bottom = cy + height * 0.5f;
+    affine_project(invert_affine_matrix, left, top, &left, &top);
+    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+    float *pout_item = parray + index * (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3);
+    *pout_item++ = left + start_x;
+    *pout_item++ = top + start_y;
+    *pout_item++ = right + start_x;
+    *pout_item++ = bottom + start_y;
+    *pout_item++ = confidence;
+    *pout_item++ = label;
+    *pout_item++ = 1;  // 1 = keep, 0 = ignore
+    *pout_item++ = position;
+    for (int i = 0; i < KEY_POINT_NUM; i++)
+    {
+        float x = *key_points++;
+        float y = *key_points++;
+        affine_project(invert_affine_matrix, x, y, &x, &y);
+        float score  = *key_points++;
+        *pout_item++ = x + start_x;
+        *pout_item++ = y + start_y;
+        *pout_item++ = score;
+    }
+}
+
+
+static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft,
+                                float btop, float bright, float bbottom)
+{
+    float cleft = max(aleft, bleft);
+    float ctop = max(atop, btop);
+    float cright = min(aright, bright);
+    float cbottom = min(abottom, bbottom);
+
+    float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
+    if (c_area == 0.0f) return 0.0f;
+
+    float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
+    float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
+    return c_area / (a_area + b_area - c_area);
+}
+
+
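+// Both NMS kernels below only ever let a thread clear its own keepflag, so no
+// synchronization is needed; and because every slice decoded into one shared
+// box array, this pass also suppresses duplicates across overlapping slices.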
+static __global__ void fast_nms_kernel(float *bboxes, int* box_count, int max_image_boxes, float threshold) 
+{
+    int position = (blockDim.x * blockIdx.x + threadIdx.x);
+    int count = min((int)*box_count, MAX_IMAGE_BOXES);
+    if (position >= count) return;
+
+    // left, top, right, bottom, confidence, class, keepflag
+    float *pcurrent = bboxes + position * NUM_BOX_ELEMENT;
+    for (int i = 0; i < count; ++i) 
+    {
+        float *pitem = bboxes + i * NUM_BOX_ELEMENT;
+        if (i == position || pcurrent[5] != pitem[5]) continue;
+
+        if (pitem[4] >= pcurrent[4]) 
+        {
+            if (pitem[4] == pcurrent[4] && i < position) continue;
+
+            float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1],
+                                pitem[2], pitem[3]);
+
+            if (iou > threshold) 
+            {
+                pcurrent[6] = 0;  // 1=keep, 0=ignore
+                return;
+            }
+        }
+    }
+}
+
+
+static __global__ void fast_nms_pose_kernel(float *bboxes, int* box_count, int max_image_boxes, float threshold) 
+{
+    int position = (blockDim.x * blockIdx.x + threadIdx.x);
+    int count = min((int)*box_count, MAX_IMAGE_BOXES);
+    if (position >= count) return;
+
+    // left, top, right, bottom, confidence, class, keepflag
+    float *pcurrent = bboxes + position * (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3);
+    for (int i = 0; i < count; ++i) 
+    {
+        float *pitem = bboxes + i * (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3);
+        if (i == position || pcurrent[5] != pitem[5]) continue;
+
+        if (pitem[4] >= pcurrent[4]) 
+        {
+            if (pitem[4] == pcurrent[4] && i < position) continue;
+
+            float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1],
+                                pitem[2], pitem[3]);
+
+            if (iou > threshold) 
+            {
+                pcurrent[6] = 0;  // 1=keep, 0=ignore
+                return;
+            }
+        }
+    }
+}
+
+static void decode_kernel_invoker_v8(float *predict, int num_bboxes, int num_classes, int output_cdim,
+                                  float confidence_threshold, float nms_threshold,
+                                  float *invert_affine_matrix, float *parray, int* box_count, int max_image_boxes,
+                                  int start_x, int start_y, cudaStream_t stream) 
+{
+    auto grid = grid_dims(num_bboxes);
+    auto block = block_dims(num_bboxes);
+
+    checkKernel(decode_kernel_v8<<<grid, block, 0, stream>>>(
+            predict, num_bboxes, num_classes, output_cdim, confidence_threshold, invert_affine_matrix,
+            parray, box_count, max_image_boxes, start_x, start_y));
+}
+
+
+static void decode_kernel_invoker_v5(float *predict, int num_bboxes, int num_classes, int output_cdim,
+                                  float confidence_threshold, float nms_threshold,
+                                  float *invert_affine_matrix, float *parray, int* box_count, int max_image_boxes,
+                                  int start_x, int start_y, cudaStream_t stream) 
+{
+    auto grid = grid_dims(num_bboxes);
+    auto block = block_dims(num_bboxes);
+
+    checkKernel(decode_kernel_v5<<<grid, block, 0, stream>>>(
+            predict, num_bboxes, num_classes, output_cdim, confidence_threshold, invert_affine_matrix,
+            parray, box_count, max_image_boxes, start_x, start_y));
+}
+
+static void decode_kernel_invoker_v11pose(float *predict, int num_bboxes, int num_classes, int output_cdim,
+    float confidence_threshold, float nms_threshold,
+    float *invert_affine_matrix, float *parray, int* box_count, int max_image_boxes,
+    int start_x, int start_y, cudaStream_t stream) 
+{
+    auto grid = grid_dims(num_bboxes);
+    auto block = block_dims(num_bboxes);
+
+    checkKernel(decode_kernel_11pose<<<grid, block, 0, stream>>>(
+            predict, num_bboxes, num_classes, output_cdim, confidence_threshold, invert_affine_matrix,
+            parray, box_count, max_image_boxes, start_x, start_y));
+}
+
+static void fast_nms_kernel_invoker(float *parray, int* box_count, int max_image_boxes, float nms_threshold, cudaStream_t stream)
+{
+    auto grid = grid_dims(max_image_boxes);
+    auto block = block_dims(max_image_boxes);
+    checkKernel(fast_nms_kernel<<<grid, block, 0, stream>>>(parray, box_count, max_image_boxes, nms_threshold));
+}
+
+static void fast_nms_pose_kernel_invoker(float *parray, int* box_count, int max_image_boxes, float nms_threshold, cudaStream_t stream)
+{
+    auto grid = grid_dims(max_image_boxes);
+    auto block = block_dims(max_image_boxes);
+    checkKernel(fast_nms_pose_kernel<<<grid, block, 0, stream>>>(parray, box_count, max_image_boxes, nms_threshold));
+}
+
+void YoloModelImpl::adjust_memory(int batch_size)
+{
+    // size device buffers for the inference batch
+    size_t input_numel = network_input_width_ * network_input_height_ * 3;
+    input_buffer_.gpu(batch_size * input_numel);
+    bbox_predict_.gpu(batch_size * bbox_head_dims_[1] * bbox_head_dims_[2]);
+
+    // pose boxes carry KEY_POINT_NUM * (x, y, score) on top of the 8 base elements
+    int box_element = (model_type_ == ModelType::YOLOV11POSE)
+                          ? (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3)
+                          : NUM_BOX_ELEMENT;
+    output_boxarray_.gpu(MAX_IMAGE_BOXES * box_element);
+    output_boxarray_.cpu(MAX_IMAGE_BOXES * box_element);
+
+    affine_matrix_.gpu(6);
+    affine_matrix_.cpu(6);
+
+    box_count_.gpu(1);
+    box_count_.cpu(1);
+}
+
+// note: the default value for stream lives only in the declaration (yolo.hpp);
+// repeating it in the definition would be a compile error
+void YoloModelImpl::preprocess(int ibatch, affine::LetterBoxMatrix &affine, void *stream)
+{
+    affine.compute(std::make_tuple(slice_->slice_width_, slice_->slice_height_),
+                std::make_tuple(network_input_width_, network_input_height_));
+
+    size_t input_numel = network_input_width_ * network_input_height_ * 3;
+    float *input_device = input_buffer_.gpu() + ibatch * input_numel;
+    size_t size_image = slice_->slice_width_ * slice_->slice_height_ * 3;
+
+    float *affine_matrix_device = affine_matrix_.gpu();
+    uint8_t *image_device = slice_->output_images_.gpu() + ibatch * size_image;
+
+    float *affine_matrix_host = affine_matrix_.cpu();
+
+    // all slices share one geometry, so a single affine matrix buffer is reused for every batch item
+    cudaStream_t stream_ = (cudaStream_t)stream;
+    memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
+    checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
+                                cudaMemcpyHostToDevice, stream_));
+
+    affine::warp_affine_bilinear_and_normalize_plane(image_device, slice_->slice_width_ * 3, slice_->slice_width_,
+                                            slice_->slice_height_, input_device, network_input_width_,
+                                            network_input_height_, affine_matrix_device, 114,
+                                            normalize_, stream_);
+}
+
+
+bool YoloModelImpl::load(const std::string &engine_file, ModelType model_type, float confidence_threshold, float nms_threshold)
+{
+    trt_ = TensorRT::load(engine_file);
+    if (trt_ == nullptr) return false;
+
+    trt_->print();
+
+    this->confidence_threshold_ = confidence_threshold;
+    this->nms_threshold_ = nms_threshold;
+    this->model_type_ = model_type;
+
+    auto input_dim = trt_->static_dims(0);
+    bbox_head_dims_ = trt_->static_dims(1);
+    network_input_width_ = input_dim[3];
+    network_input_height_ = input_dim[2];
+    isdynamic_model_ = trt_->has_dynamic_dim();
+
+    normalize_ = affine::Norm::alpha_beta(1 / 255.0f, 0.0f, affine::ChannelType::SwapRB);
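+    // e.g. an 80-class YOLOv8 engine exported with its head transposed to
+    // [batch, boxes, channels] reports output0 dims [1, 8400, 84], so
+    // num_classes_ = 84 - 4 = 80 below (the stock [1, 84, 8400] layout
+    // would need a transpose before this decode)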
+    if (this->model_type_ == ModelType::YOLOV8 || this->model_type_ == ModelType::YOLOV11)
+    {
+        num_classes_ = bbox_head_dims_[2] - 4;
+    }
+    else if (this->model_type_ == ModelType::YOLOV5)
+    {
+        num_classes_ = bbox_head_dims_[2] - 5;
+    }
+    else if (this->model_type_ == ModelType::YOLOV11POSE)
+    {
+        num_classes_ = bbox_head_dims_[2] - 4 - KEY_POINT_NUM * 3;
+        // pose boxes occupy NUM_BOX_ELEMENT + KEY_POINT_NUM * 3 floats each (see adjust_memory)
+    }
+    return true;
+}
+
+data::BoxArray YoloModelImpl::forward(const tensor::Image &image, int slice_width, int slice_height, float overlap_width_ratio, float overlap_height_ratio, void *stream)
+{
+    slice_->slice(image, slice_width, slice_height, overlap_width_ratio, overlap_height_ratio, stream);
+    return forwards(stream);
+}
+
+data::BoxArray YoloModelImpl::forward(const tensor::Image &image, void *stream)
+{
+    slice_->autoSlice(image, stream);
+    return forwards(stream);
+}
+
+data::BoxArray YoloModelImpl::forwards(void *stream)
+{
+    int num_image = slice_->slice_num_h_ * slice_->slice_num_v_;
+    if (num_image == 0) return {};
+    
+    auto input_dims = trt_->static_dims(0);
+    int infer_batch_size = input_dims[0];
+    if (infer_batch_size != num_image) 
+    {
+        if (isdynamic_model_) 
+        {
+            infer_batch_size = num_image;
+            input_dims[0] = num_image;
+            if (!trt_->set_run_dims(0, input_dims)) 
+            {
+                printf("Fail to set run dims\n");
+                return {};
+            }
+        } 
+        else 
+        {
+            if (infer_batch_size < num_image) 
+            {
+                printf(
+                    "When using a static-shape model, the number of images[%d] must be "
+                    "less than or equal to the maximum batch[%d].\n",
+                    num_image, infer_batch_size);
+                return {};
+            }
+        }
+    }
+    adjust_memory(infer_batch_size);
+
+    affine::LetterBoxMatrix affine_matrix;
+    cudaStream_t stream_ = (cudaStream_t)stream;
+    for (int i = 0; i < num_image; ++i)
+        preprocess(i, affine_matrix, stream);
+
+    float *bbox_output_device = bbox_predict_.gpu();
+    #ifdef TRT10
+    if (!trt_->forward(std::unordered_map<std::string, const void *>{
+            { "images", input_buffer_.gpu() }, 
+            { "output0", bbox_predict_.gpu() }
+        }, stream_))
+    {
+        printf("Failed to tensorRT forward.");
+        return {};
+    }
+    #else
+    std::vector<void *> bindings{input_buffer_.gpu(), bbox_output_device};
+    if (!trt_->forward(bindings, stream_)) 
+    {
+        printf("Failed to tensorRT forward.");
+        return {};
+    }
+    #endif
+
+    int* box_count = box_count_.gpu();
+    checkRuntime(cudaMemsetAsync(box_count, 0, sizeof(int), stream_));
+    for (int ib = 0; ib < num_image; ++ib) 
+    {
+        int start_x = slice_->slice_start_point_.cpu()[ib*2];
+        int start_y = slice_->slice_start_point_.cpu()[ib*2+1];
+        // float *boxarray_device =
+        //     output_boxarray_.gpu() + ib * (MAX_IMAGE_BOXES * NUM_BOX_ELEMENT);
+        float *boxarray_device = output_boxarray_.gpu();
+        float *affine_matrix_device = affine_matrix_.gpu();
+        float *image_based_bbox_output =
+            bbox_output_device + ib * (bbox_head_dims_[1] * bbox_head_dims_[2]);
+        if (model_type_ == ModelType::YOLOV5)
+        {
+            decode_kernel_invoker_v5(image_based_bbox_output, bbox_head_dims_[1], num_classes_,
+                                bbox_head_dims_[2], confidence_threshold_, nms_threshold_,
+                                affine_matrix_device, boxarray_device, box_count, MAX_IMAGE_BOXES, start_x, start_y, stream_);
+        }
+        else if (model_type_ == ModelType::YOLOV8 || model_type_ == ModelType::YOLOV11)
+        {
+            decode_kernel_invoker_v8(image_based_bbox_output, bbox_head_dims_[1], num_classes_,
+                                bbox_head_dims_[2], confidence_threshold_, nms_threshold_,
+                                affine_matrix_device, boxarray_device, box_count, MAX_IMAGE_BOXES, start_x, start_y, stream_);
+        }
+        else if (model_type_ == ModelType::YOLOV11POSE)
+        {
+            decode_kernel_invoker_v11pose(image_based_bbox_output, bbox_head_dims_[1], num_classes_,
+                bbox_head_dims_[2], confidence_threshold_, nms_threshold_,
+                affine_matrix_device, boxarray_device, box_count, MAX_IMAGE_BOXES, start_x, start_y, stream_);
+        }
+        
+    }
+    float *boxarray_device =  output_boxarray_.gpu();
+    if (model_type_ == ModelType::YOLOV11POSE)
+    {
+        fast_nms_pose_kernel_invoker(boxarray_device, box_count, MAX_IMAGE_BOXES, nms_threshold_, stream_);
+    }
+    else
+    {
+        fast_nms_kernel_invoker(boxarray_device, box_count, MAX_IMAGE_BOXES, nms_threshold_, stream_);
+    }
+    
+    checkRuntime(cudaMemcpyAsync(output_boxarray_.cpu(), output_boxarray_.gpu(),
+                                output_boxarray_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+    checkRuntime(cudaMemcpyAsync(box_count_.cpu(), box_count_.gpu(),
+                                box_count_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+    checkRuntime(cudaStreamSynchronize(stream_));
+
+    data::BoxArray result;
+    // int imemory = 0;
+    float *parray = output_boxarray_.cpu();
+    int count = min(MAX_IMAGE_BOXES, *(box_count_.cpu()));
+
+    for (int i = 0; i < count; ++i) 
+    {
+        int box_element = (model_type_ == ModelType::YOLOV11POSE) ? (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3) : NUM_BOX_ELEMENT;
+        float *pbox = parray + i * box_element;
+        int label = pbox[5];
+        int keepflag = pbox[6];
+        if (keepflag == 1) 
+        {
+            data::Box result_object_box(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], label);
+            if (model_type_ == ModelType::YOLOV11POSE)
+            {
+                result_object_box.pose.reserve(KEY_POINT_NUM);
+                for (int k = 0; k < KEY_POINT_NUM; k++)
+                {
+                    result_object_box.pose.emplace_back(pbox[8 + k * 3], pbox[8 + k * 3 + 1], pbox[8 + k * 3 + 2]);
+                }
+            }
+            
+            result.emplace_back(result_object_box);
+        }
+    }
+    return result;
+}
+
+Infer *loadraw(const std::string &engine_file, ModelType model_type, float confidence_threshold,
+               float nms_threshold) 
+{
+    YoloModelImpl *impl = new YoloModelImpl();
+    if (!impl->load(engine_file, model_type, confidence_threshold, nms_threshold)) 
+    {
+        delete impl;
+        return nullptr;
+    }
+    impl->slice_ = std::make_shared<slice::SliceImage>();
+    return impl;
+}
+
+std::shared_ptr<Infer> load_yolo(const std::string &engine_file, ModelType model_type, int gpu_id, float confidence_threshold, float nms_threshold) 
+{
+    checkRuntime(cudaSetDevice(gpu_id));
+    return std::shared_ptr<YoloModelImpl>((YoloModelImpl *)loadraw(engine_file, model_type, confidence_threshold, nms_threshold));
+}
+
+}

+ 74 - 0
src/infer/trt/yolo.hpp

@@ -0,0 +1,74 @@
+#ifndef YOLO_HPP__
+#define YOLO_HPP__
+#include <vector>
+#include <iomanip>
+#include "common/memory.hpp"
+#include "infer/infer.hpp"
+#include "common/image.hpp"
+#include "common/data.hpp"
+#include "infer/slice/slice.hpp"
+#include "infer/trt/affine.hpp"
+
+#ifdef TRT10
+#include "common/tensorrt.hpp"
+namespace TensorRT = TensorRT10;
+#else
+#include "common/tensorrt8.hpp"
+namespace TensorRT = TensorRT8;
+#endif
+
+namespace yolo
+{
+
+    class YoloModelImpl : public Infer 
+    {
+    public:
+        ModelType model_type_;
+    
+        // for sahi crop image
+        std::shared_ptr<slice::SliceImage> slice_;
+        std::shared_ptr<TensorRT::Engine> trt_;
+        std::string engine_file_;
+    
+        tensor::Memory<int> box_count_;
+    
+        tensor::Memory<float> affine_matrix_;
+        tensor::Memory<float>  input_buffer_, bbox_predict_, output_boxarray_;
+    
+        int network_input_width_, network_input_height_;
+        affine::Norm normalize_;
+        std::vector<int> bbox_head_dims_;
+        bool isdynamic_model_ = false;
+    
+        float confidence_threshold_;
+        float nms_threshold_;
+    
+        int num_classes_ = 0;
+
+        YoloModelImpl() = default;
+    
+        virtual ~YoloModelImpl() = default;
+    
+        void adjust_memory(int batch_size);
+    
+        void preprocess(int ibatch, affine::LetterBoxMatrix &affine, void *stream = nullptr);
+        
+    
+        bool load(const std::string &engine_file, ModelType model_type, float confidence_threshold, float nms_threshold);
+    
+        virtual data::BoxArray forward(const tensor::Image &image, int slice_width, int slice_height, float overlap_width_ratio, float overlap_height_ratio, void *stream = nullptr) override;
+    
+        virtual data::BoxArray forward(const tensor::Image &image, void *stream = nullptr) override;
+    
+        virtual data::BoxArray forwards(void *stream = nullptr) override;
+};
+
+Infer *loadraw(const std::string &engine_file, ModelType model_type, float confidence_threshold,
+    float nms_threshold);
+
+std::shared_ptr<Infer> load_yolo(const std::string &engine_file, ModelType model_type, int gpu_id, float confidence_threshold, float nms_threshold);
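+// Minimal usage sketch (engine path, thresholds and the image source are
+// illustrative):
+//   auto model = yolo::load_yolo("yolov8n.engine", ModelType::YOLOV8, 0, 0.25f, 0.45f);
+//   auto boxes = model->forward(image);  // SAHI auto-slice + batched infer + global NMS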
+
+} // namespace yolo
+
+#endif // YOLO_HPP__
+

+ 4 - 4
src/main.cpp

@@ -8,8 +8,8 @@
 
 int main()
 {
-    std::shared_ptr<Node::StreamNode> src_node   = std::make_shared<Node::StreamNode>("src", "rtsp://admin:lww123456@172.16.22.16:554/Streaming/Channels/101");
-    src_node->set_skip_frame(10);
+    std::shared_ptr<Node::StreamNode> src_node0   = std::make_shared<Node::StreamNode>("src0", "rtsp://admin:lww123456@172.16.22.16:554/Streaming/Channels/101");
+    src_node0->set_skip_frame(10);
 
     std::shared_ptr<Node::StreamNode> src_node1   = std::make_shared<Node::StreamNode>("src1", "rtsp://admin:lww123456@172.16.22.16:554/Streaming/Channels/201");
     src_node1->set_skip_frame(10);
@@ -21,7 +21,7 @@ int main()
     std::shared_ptr<Node::InferNode> infer_node   = std::make_shared<Node::InferNode>("infer");
     std::shared_ptr<Node::DrawNode> draw_node     = std::make_shared<Node::DrawNode>("draw");
     std::shared_ptr<Node::HttpPushNode> push_node = std::make_shared<Node::HttpPushNode>("push", "172.16.20.168", 8080, "/push");
-    Node::LinkNode(src_node, infer_node);
+    Node::LinkNode(src_node0, infer_node);
     Node::LinkNode(src_node1, infer_node);
     Node::LinkNode(src_node2, infer_node);
     Node::LinkNode(infer_node, draw_node);
@@ -29,7 +29,7 @@ int main()
     push_node->start();
     draw_node->start();
     infer_node->start();
-    src_node->start();
+    src_node0->start();
     src_node1->start();
     src_node2->start();
     getchar();

+ 4 - 5
src/nodes/infer/inferNode.hpp

@@ -3,7 +3,7 @@
 
 #include "nodes/base/base.hpp"
 #include <opencv2/opencv.hpp>
-
+#include "infer/infer.hpp"
 namespace Node
 {
 
@@ -12,18 +12,17 @@ class InferNode : public BaseNode
 public:
     InferNode() = delete;
     InferNode(const std::string& name) : BaseNode(name, NODE_TYPE::MID_NODE) {}
-    InferNode(const std::string& name, const std::string& model_path) : BaseNode(name, NODE_TYPE::MID_NODE), model_path_(model_path){}
     virtual ~InferNode() { };
 
-    void set_model_path(const std::string& model_path)
+    void set_model_instance(std::shared_ptr<Infer> model)
     {
-        model_path_ = model_path;
+        model_ = model;
     }
 
     void work() override;
 
 private:
-    std::string model_path_;
+    std::shared_ptr<Infer> model_ = nullptr;
 };
 
 }

+ 0 - 29
src/pipeline/pipiline.hpp

@@ -1,29 +0,0 @@
-#ifndef PIPELINE_HPP__
-#define PIPELINE_HPP__
-
-#include <string>
-
-
-namespace pip
-{
-
-class Pipeline
-{
-public:
-    Pipeline() = delete;
-    Pipeline(const std::string& name) : name_(name) {}
-
-    void start();
-    void stop();
-
-private:
-    std::string name_;
-    bool running_ = false;
-};
-
-
-void create_pipeline();
-
-}
-
-#endif // PIPELINE_HPP__

+ 5 - 0
src/stream/stream.hpp

@@ -0,0 +1,5 @@
+#ifndef STREAM_HPP__
+#define STREAM_HPP__
+
+
+#endif  // STREAM_HPP__