@@ -0,0 +1,564 @@
+#include "model/yolo.hpp"
+
+#include <vector>
+#include <memory>
+#include <algorithm>
+#include <unordered_map>
+#include <cstring>
+#include <cstdio>
+
+#include "slice/slice.hpp"
+#include "model/affine.hpp"
+#include "common/check.hpp"
+
+#define GPU_BLOCK_THREADS 512
+
+namespace yolo
+{
+
+static const int NUM_BOX_ELEMENT = 8;  // left, top, right, bottom, confidence, class, keepflag, row_index(output)
+static const int MAX_IMAGE_BOXES = 1024 * 4;
+
+static const int KEY_POINT_NUM = 17;  // number of keypoints (COCO pose layout)
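+
+// Each decoded box is stored as NUM_BOX_ELEMENT floats; the pose model appends
+// KEY_POINT_NUM * 3 extra floats (x, y, score per keypoint), so its per-box
+// stride is NUM_BOX_ELEMENT + KEY_POINT_NUM * 3.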
+
+// Launch helpers: ceil-divide the job count into blocks of at most
+// GPU_BLOCK_THREADS threads each.
+static dim3 grid_dims(int numJobs)
+{
+    int numBlockThreads = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
+    return dim3((numJobs + numBlockThreads - 1) / numBlockThreads);
+}
+
+static dim3 block_dims(int numJobs)
+{
+    return numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
+}
+
+// Apply a 2x3 affine matrix [m0 m1 m2; m3 m4 m5] to the point (x, y).
+static __host__ __device__ void affine_project(float *matrix, float x, float y, float *ox, float *oy)
+{
+    *ox = matrix[0] * x + matrix[1] * y + matrix[2];
+    *oy = matrix[3] * x + matrix[4] * y + matrix[5];
+}
+
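+// YOLOv5 head layout per candidate row: [cx, cy, w, h, objectness,
+// class_0 ... class_{num_classes-1}]; the final score is
+// objectness * best class score.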
+static __global__ void decode_kernel_v5(float *predict, int num_bboxes, int num_classes,
+                                        int output_cdim, float confidence_threshold,
+                                        float *invert_affine_matrix, float *parray, int *box_count,
+                                        int max_image_boxes, int start_x, int start_y)
+{
+    int position = blockDim.x * blockIdx.x + threadIdx.x;
+    if (position >= num_bboxes) return;
+
+    float *pitem = predict + output_cdim * position;
+    float objectness = pitem[4];
+    if (objectness < confidence_threshold) return;
+
+    float *class_confidence = pitem + 5;
+
+    float confidence = *class_confidence++;
+    int label = 0;
+    for (int i = 1; i < num_classes; ++i, ++class_confidence)
+    {
+        if (*class_confidence > confidence)
+        {
+            confidence = *class_confidence;
+            label = i;
+        }
+    }
+    confidence *= objectness;
+    if (confidence < confidence_threshold) return;
+
+    int index = atomicAdd(box_count, 1);
+    if (index >= max_image_boxes) return;
+
+    float cx = *pitem++;
+    float cy = *pitem++;
+    float width = *pitem++;
+    float height = *pitem++;
+    float left = cx - width * 0.5f;
+    float top = cy - height * 0.5f;
+    float right = cx + width * 0.5f;
+    float bottom = cy + height * 0.5f;
+    affine_project(invert_affine_matrix, left, top, &left, &top);
+    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+    float *pout_item = parray + index * NUM_BOX_ELEMENT;
+    *pout_item++ = left + start_x;
+    *pout_item++ = top + start_y;
+    *pout_item++ = right + start_x;
+    *pout_item++ = bottom + start_y;
+    *pout_item++ = confidence;
+    *pout_item++ = label;
+    *pout_item++ = 1;  // 1 = keep, 0 = ignore
+    *pout_item++ = position;
+}
+
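+// YOLOv8/YOLOv11 head layout per candidate row: [cx, cy, w, h,
+// class_0 ... class_{num_classes-1}] -- no separate objectness term.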
+static __global__ void decode_kernel_v8(float *predict, int num_bboxes, int num_classes,
+                                        int output_cdim, float confidence_threshold,
+                                        float *invert_affine_matrix, float *parray, int *box_count,
+                                        int max_image_boxes, int start_x, int start_y)
+{
+    int position = blockDim.x * blockIdx.x + threadIdx.x;
+    if (position >= num_bboxes) return;
+
+    float *pitem = predict + output_cdim * position;
+    float *class_confidence = pitem + 4;
+    float confidence = *class_confidence++;
+    int label = 0;
+    for (int i = 1; i < num_classes; ++i, ++class_confidence)
+    {
+        if (*class_confidence > confidence)
+        {
+            confidence = *class_confidence;
+            label = i;
+        }
+    }
+    if (confidence < confidence_threshold) return;
+
+    int index = atomicAdd(box_count, 1);
+    if (index >= max_image_boxes) return;
+
+    float cx = *pitem++;
+    float cy = *pitem++;
+    float width = *pitem++;
+    float height = *pitem++;
+    float left = cx - width * 0.5f;
+    float top = cy - height * 0.5f;
+    float right = cx + width * 0.5f;
+    float bottom = cy + height * 0.5f;
+    affine_project(invert_affine_matrix, left, top, &left, &top);
+    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+    float *pout_item = parray + index * NUM_BOX_ELEMENT;
+    *pout_item++ = left + start_x;
+    *pout_item++ = top + start_y;
+    *pout_item++ = right + start_x;
+    *pout_item++ = bottom + start_y;
+    *pout_item++ = confidence;
+    *pout_item++ = label;
+    *pout_item++ = 1;  // 1 = keep, 0 = ignore
+    *pout_item++ = position;
+}
+
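+// YOLOv11-pose head layout per candidate row: [cx, cy, w, h,
+// class scores..., then KEY_POINT_NUM * (x, y, score)]; keypoints start at
+// pitem + 4 + num_classes.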
+static __global__ void decode_kernel_11pose(float *predict, int num_bboxes, int num_classes,
+                                            int output_cdim, float confidence_threshold,
+                                            float *invert_affine_matrix, float *parray,
+                                            int *box_count, int max_image_boxes, int start_x, int start_y)
+{
+    int position = blockDim.x * blockIdx.x + threadIdx.x;
+    if (position >= num_bboxes) return;
+
+    float *pitem = predict + output_cdim * position;
+    float *class_confidence = pitem + 4;
+    float *key_points = pitem + 4 + num_classes;
+    float confidence = *class_confidence++;
+
+    int label = 0;
+    for (int i = 1; i < num_classes; ++i, ++class_confidence)
+    {
+        if (*class_confidence > confidence)
+        {
+            confidence = *class_confidence;
+            label = i;
+        }
+    }
+    if (confidence < confidence_threshold) return;
+
+    int index = atomicAdd(box_count, 1);
+    if (index >= max_image_boxes) return;
+
+    float cx = *pitem++;
+    float cy = *pitem++;
+    float width = *pitem++;
+    float height = *pitem++;
+    float left = cx - width * 0.5f;
+    float top = cy - height * 0.5f;
+    float right = cx + width * 0.5f;
+    float bottom = cy + height * 0.5f;
+    affine_project(invert_affine_matrix, left, top, &left, &top);
+    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+    float *pout_item = parray + index * (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3);
+    *pout_item++ = left + start_x;
+    *pout_item++ = top + start_y;
+    *pout_item++ = right + start_x;
+    *pout_item++ = bottom + start_y;
+    *pout_item++ = confidence;
+    *pout_item++ = label;
+    *pout_item++ = 1;  // 1 = keep, 0 = ignore
+    *pout_item++ = position;
+    for (int i = 0; i < KEY_POINT_NUM; i++)
+    {
+        float x = *key_points++;
+        float y = *key_points++;
+        affine_project(invert_affine_matrix, x, y, &x, &y);
+        float score = *key_points++;
+        *pout_item++ = x + start_x;
+        *pout_item++ = y + start_y;
+        *pout_item++ = score;
+    }
+}
+
+static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft,
+                                float btop, float bright, float bbottom)
+{
+    float cleft = max(aleft, bleft);
+    float ctop = max(atop, btop);
+    float cright = min(aright, bright);
+    float cbottom = min(abottom, bbottom);
+
+    float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
+    if (c_area == 0.0f) return 0.0f;
+
+    float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
+    float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
+    return c_area / (a_area + b_area - c_area);
+}
+
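+// Fast parallel NMS: one thread per decoded box scans every other box of the
+// same class; if a higher-scoring box (ties broken by index) overlaps it with
+// IoU above the threshold, the thread clears its own keepflag. No sorting of
+// candidates is required.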
+static __global__ void fast_nms_kernel(float *bboxes, int *box_count, int max_image_boxes, float threshold)
+{
+    int position = (blockDim.x * blockIdx.x + threadIdx.x);
+    int count = min((int)*box_count, MAX_IMAGE_BOXES);
+    if (position >= count) return;
+
+    // left, top, right, bottom, confidence, class, keepflag
+    float *pcurrent = bboxes + position * NUM_BOX_ELEMENT;
+    for (int i = 0; i < count; ++i)
+    {
+        float *pitem = bboxes + i * NUM_BOX_ELEMENT;
+        if (i == position || pcurrent[5] != pitem[5]) continue;
+
+        if (pitem[4] >= pcurrent[4])
+        {
+            if (pitem[4] == pcurrent[4] && i < position) continue;
+
+            float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1],
+                                pitem[2], pitem[3]);
+
+            if (iou > threshold)
+            {
+                pcurrent[6] = 0;  // 1 = keep, 0 = ignore
+                return;
+            }
+        }
+    }
+}
+
+// Identical to fast_nms_kernel, but with the larger per-box pose stride.
+static __global__ void fast_nms_pose_kernel(float *bboxes, int *box_count, int max_image_boxes, float threshold)
+{
+    int position = (blockDim.x * blockIdx.x + threadIdx.x);
+    int count = min((int)*box_count, MAX_IMAGE_BOXES);
+    if (position >= count) return;
+
+    // left, top, right, bottom, confidence, class, keepflag
+    float *pcurrent = bboxes + position * (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3);
+    for (int i = 0; i < count; ++i)
+    {
+        float *pitem = bboxes + i * (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3);
+        if (i == position || pcurrent[5] != pitem[5]) continue;
+
+        if (pitem[4] >= pcurrent[4])
+        {
+            if (pitem[4] == pcurrent[4] && i < position) continue;
+
+            float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1],
+                                pitem[2], pitem[3]);
+
+            if (iou > threshold)
+            {
+                pcurrent[6] = 0;  // 1 = keep, 0 = ignore
+                return;
+            }
+        }
+    }
+}
+
+static void decode_kernel_invoker_v8(float *predict, int num_bboxes, int num_classes, int output_cdim,
+                                     float confidence_threshold, float nms_threshold,
+                                     float *invert_affine_matrix, float *parray, int *box_count, int max_image_boxes,
+                                     int start_x, int start_y, cudaStream_t stream)
+{
+    auto grid = grid_dims(num_bboxes);
+    auto block = block_dims(num_bboxes);
+
+    checkKernel(decode_kernel_v8<<<grid, block, 0, stream>>>(
+        predict, num_bboxes, num_classes, output_cdim, confidence_threshold, invert_affine_matrix,
+        parray, box_count, max_image_boxes, start_x, start_y));
+}
+
+static void decode_kernel_invoker_v5(float *predict, int num_bboxes, int num_classes, int output_cdim,
+                                     float confidence_threshold, float nms_threshold,
+                                     float *invert_affine_matrix, float *parray, int *box_count, int max_image_boxes,
+                                     int start_x, int start_y, cudaStream_t stream)
+{
+    auto grid = grid_dims(num_bboxes);
+    auto block = block_dims(num_bboxes);
+
+    checkKernel(decode_kernel_v5<<<grid, block, 0, stream>>>(
+        predict, num_bboxes, num_classes, output_cdim, confidence_threshold, invert_affine_matrix,
+        parray, box_count, max_image_boxes, start_x, start_y));
+}
+
+static void decode_kernel_invoker_v11pose(float *predict, int num_bboxes, int num_classes, int output_cdim,
+                                          float confidence_threshold, float nms_threshold,
+                                          float *invert_affine_matrix, float *parray, int *box_count, int max_image_boxes,
+                                          int start_x, int start_y, cudaStream_t stream)
+{
+    auto grid = grid_dims(num_bboxes);
+    auto block = block_dims(num_bboxes);
+
+    checkKernel(decode_kernel_11pose<<<grid, block, 0, stream>>>(
+        predict, num_bboxes, num_classes, output_cdim, confidence_threshold, invert_affine_matrix,
+        parray, box_count, max_image_boxes, start_x, start_y));
+}
+
+static void fast_nms_kernel_invoker(float *parray, int *box_count, int max_image_boxes, float nms_threshold, cudaStream_t stream)
+{
+    auto grid = grid_dims(max_image_boxes);
+    auto block = block_dims(max_image_boxes);
+    checkKernel(fast_nms_kernel<<<grid, block, 0, stream>>>(parray, box_count, max_image_boxes, nms_threshold));
+}
+
+static void fast_nms_pose_kernel_invoker(float *parray, int *box_count, int max_image_boxes, float nms_threshold, cudaStream_t stream)
+{
+    auto grid = grid_dims(max_image_boxes);
+    auto block = block_dims(max_image_boxes);
+    checkKernel(fast_nms_pose_kernel<<<grid, block, 0, stream>>>(parray, box_count, max_image_boxes, nms_threshold));
+}
+
+void YoloModelImpl::adjust_memory(int batch_size)
+{
+    // the inference batch_size
+    size_t input_numel = network_input_width_ * network_input_height_ * 3;
+    input_buffer_.gpu(batch_size * input_numel);
+    bbox_predict_.gpu(batch_size * bbox_head_dims_[1] * bbox_head_dims_[2]);
+
+    // The pose decode kernel writes KEY_POINT_NUM * 3 extra floats per box,
+    // so the output array must be sized with the larger per-box stride.
+    size_t box_element = (model_type_ == ModelType::YOLOV11POSE)
+                             ? (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3)
+                             : NUM_BOX_ELEMENT;
+    output_boxarray_.gpu(MAX_IMAGE_BOXES * box_element);
+    output_boxarray_.cpu(MAX_IMAGE_BOXES * box_element);
+
+    affine_matrix_.gpu(6);
+    affine_matrix_.cpu(6);
+
+    box_count_.gpu(1);
+    box_count_.cpu(1);
+}
+
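+// Per-slice preprocessing: compute the letterbox affine mapping slice ->
+// network input, stage its inverse (d2i) through the host buffer, then warp,
+// pad with 114 and normalize the slice into this batch item's input plane.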
+void YoloModelImpl::preprocess(int ibatch, affine::LetterBoxMatrix &affine, void *stream)
+{
+    affine.compute(std::make_tuple(slice_->slice_width_, slice_->slice_height_),
+                   std::make_tuple(network_input_width_, network_input_height_));
+
+    size_t input_numel = network_input_width_ * network_input_height_ * 3;
+    float *input_device = input_buffer_.gpu() + ibatch * input_numel;
+    size_t size_image = slice_->slice_width_ * slice_->slice_height_ * 3;
+
+    float *affine_matrix_device = affine_matrix_.gpu();
+    uint8_t *image_device = slice_->output_images_.gpu() + ibatch * size_image;
+
+    float *affine_matrix_host = affine_matrix_.cpu();
+
+    // Stage the matrix in host memory, then copy it asynchronously on the stream.
+    cudaStream_t stream_ = (cudaStream_t)stream;
+    memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
+    checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
+                                 cudaMemcpyHostToDevice, stream_));
+
+    affine::warp_affine_bilinear_and_normalize_plane(image_device, slice_->slice_width_ * 3, slice_->slice_width_,
+                                                     slice_->slice_height_, input_device, network_input_width_,
+                                                     network_input_height_, affine_matrix_device, 114,
+                                                     normalize_, stream_);
+}
+
+bool YoloModelImpl::load(const std::string &engine_file, ModelType model_type, float confidence_threshold, float nms_threshold)
+{
+    trt_ = TensorRT::load(engine_file);
+    if (trt_ == nullptr) return false;
+
+    trt_->print();
+
+    this->confidence_threshold_ = confidence_threshold;
+    this->nms_threshold_ = nms_threshold;
+    this->model_type_ = model_type;
+
+    auto input_dim = trt_->static_dims(0);
+    bbox_head_dims_ = trt_->static_dims(1);
+    network_input_width_ = input_dim[3];
+    network_input_height_ = input_dim[2];
+    isdynamic_model_ = trt_->has_dynamic_dim();
+
+    normalize_ = affine::Norm::alpha_beta(1 / 255.0f, 0.0f, affine::ChannelType::SwapRB);
+    if (this->model_type_ == ModelType::YOLOV8 || this->model_type_ == ModelType::YOLOV11)
+    {
+        // head row = 4 box terms + class scores
+        num_classes_ = bbox_head_dims_[2] - 4;
+    }
+    else if (this->model_type_ == ModelType::YOLOV5)
+    {
+        // head row = 4 box terms + objectness + class scores
+        num_classes_ = bbox_head_dims_[2] - 5;
+    }
+    else if (this->model_type_ == ModelType::YOLOV11POSE)
+    {
+        // head row = 4 box terms + class scores + KEY_POINT_NUM * (x, y, score)
+        num_classes_ = bbox_head_dims_[2] - 4 - KEY_POINT_NUM * 3;
+    }
+    return true;
+}
+
+data::BoxArray YoloModelImpl::forward(const tensor::Image &image, int slice_width, int slice_height, float overlap_width_ratio, float overlap_height_ratio, void *stream)
+{
+    slice_->slice(image, slice_width, slice_height, overlap_width_ratio, overlap_height_ratio, stream);
+    return forwards(stream);
+}
+
+data::BoxArray YoloModelImpl::forward(const tensor::Image &image, void *stream)
+{
+    slice_->autoSlice(image, stream);
+    return forwards(stream);
+}
+
+data::BoxArray YoloModelImpl::forwards(void *stream)
+{
+    int num_image = slice_->slice_num_h_ * slice_->slice_num_v_;
+    if (num_image == 0) return {};
+
+    auto input_dims = trt_->static_dims(0);
+    int infer_batch_size = input_dims[0];
+    if (infer_batch_size != num_image)
+    {
+        if (isdynamic_model_)
+        {
+            infer_batch_size = num_image;
+            input_dims[0] = num_image;
+            if (!trt_->set_run_dims(0, input_dims))
+            {
+                printf("Fail to set run dims\n");
+                return {};
+            }
+        }
+        else
+        {
+            if (infer_batch_size < num_image)
+            {
+                printf(
+                    "When using a static-shape model, the number of images[%d] must be "
+                    "less than or equal to the maximum batch[%d].\n",
+                    num_image, infer_batch_size);
+                return {};
+            }
+        }
+    }
+    adjust_memory(infer_batch_size);
+
+    affine::LetterBoxMatrix affine_matrix;
+    cudaStream_t stream_ = (cudaStream_t)stream;
+    for (int i = 0; i < num_image; ++i)
+        preprocess(i, affine_matrix, stream);
+
+    float *bbox_output_device = bbox_predict_.gpu();
+#ifdef TRT10
+    if (!trt_->forward(std::unordered_map<std::string, const void *>{
+            { "images", input_buffer_.gpu() },
+            { "output0", bbox_predict_.gpu() }
+        }, stream_))
+    {
+        printf("Failed to run TensorRT forward.\n");
+        return {};
+    }
+#else
+    std::vector<void *> bindings{input_buffer_.gpu(), bbox_output_device};
+    if (!trt_->forward(bindings, stream_))
+    {
+        printf("Failed to run TensorRT forward.\n");
+        return {};
+    }
+#endif
+
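+    // Decode every slice into one shared box array: each box is offset by its
+    // slice origin (start_x, start_y) so coordinates land in the full image,
+    // and box_count is a single atomic counter shared by all slices.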
+    int *box_count = box_count_.gpu();
+    checkRuntime(cudaMemsetAsync(box_count, 0, sizeof(int), stream_));
+    for (int ib = 0; ib < num_image; ++ib)
+    {
+        int start_x = slice_->slice_start_point_.cpu()[ib * 2];
+        int start_y = slice_->slice_start_point_.cpu()[ib * 2 + 1];
+        float *boxarray_device = output_boxarray_.gpu();
+        float *affine_matrix_device = affine_matrix_.gpu();
+        float *image_based_bbox_output =
+            bbox_output_device + ib * (bbox_head_dims_[1] * bbox_head_dims_[2]);
+        if (model_type_ == ModelType::YOLOV5)
+        {
+            decode_kernel_invoker_v5(image_based_bbox_output, bbox_head_dims_[1], num_classes_,
+                                     bbox_head_dims_[2], confidence_threshold_, nms_threshold_,
+                                     affine_matrix_device, boxarray_device, box_count, MAX_IMAGE_BOXES, start_x, start_y, stream_);
+        }
+        else if (model_type_ == ModelType::YOLOV8 || model_type_ == ModelType::YOLOV11)
+        {
+            decode_kernel_invoker_v8(image_based_bbox_output, bbox_head_dims_[1], num_classes_,
+                                     bbox_head_dims_[2], confidence_threshold_, nms_threshold_,
+                                     affine_matrix_device, boxarray_device, box_count, MAX_IMAGE_BOXES, start_x, start_y, stream_);
+        }
+        else if (model_type_ == ModelType::YOLOV11POSE)
+        {
+            decode_kernel_invoker_v11pose(image_based_bbox_output, bbox_head_dims_[1], num_classes_,
+                                          bbox_head_dims_[2], confidence_threshold_, nms_threshold_,
+                                          affine_matrix_device, boxarray_device, box_count, MAX_IMAGE_BOXES, start_x, start_y, stream_);
+        }
+    }
+
+    float *boxarray_device = output_boxarray_.gpu();
+    if (model_type_ == ModelType::YOLOV11POSE)
+    {
+        fast_nms_pose_kernel_invoker(boxarray_device, box_count, MAX_IMAGE_BOXES, nms_threshold_, stream_);
+    }
+    else
+    {
+        fast_nms_kernel_invoker(boxarray_device, box_count, MAX_IMAGE_BOXES, nms_threshold_, stream_);
+    }
+
+    checkRuntime(cudaMemcpyAsync(output_boxarray_.cpu(), output_boxarray_.gpu(),
+                                 output_boxarray_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+    checkRuntime(cudaMemcpyAsync(box_count_.cpu(), box_count_.gpu(),
+                                 box_count_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+    checkRuntime(cudaStreamSynchronize(stream_));
+
+    data::BoxArray result;
+    float *parray = output_boxarray_.cpu();
+    int count = std::min(MAX_IMAGE_BOXES, *(box_count_.cpu()));
+    int box_element = (model_type_ == ModelType::YOLOV11POSE)
+                          ? (NUM_BOX_ELEMENT + KEY_POINT_NUM * 3)
+                          : NUM_BOX_ELEMENT;
+
+    for (int i = 0; i < count; ++i)
+    {
+        float *pbox = parray + i * box_element;
+        int label = pbox[5];
+        int keepflag = pbox[6];
+        if (keepflag == 1)
+        {
+            data::Box result_object_box(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], label);
+            if (model_type_ == ModelType::YOLOV11POSE)
+            {
+                result_object_box.pose.reserve(KEY_POINT_NUM);
+                for (int k = 0; k < KEY_POINT_NUM; k++)
+                {
+                    result_object_box.pose.emplace_back(pbox[8 + k * 3], pbox[8 + k * 3 + 1], pbox[8 + k * 3 + 2]);
+                }
+            }
+            result.emplace_back(result_object_box);
+        }
+    }
+    return result;
+}
+
+Infer *loadraw(const std::string &engine_file, ModelType model_type, float confidence_threshold,
+               float nms_threshold)
+{
+    YoloModelImpl *impl = new YoloModelImpl();
+    if (!impl->load(engine_file, model_type, confidence_threshold, nms_threshold))
+    {
+        delete impl;
+        return nullptr;
+    }
+    impl->slice_ = std::make_shared<slice::SliceImage>();
+    return impl;
+}
+
+std::shared_ptr<Infer> load_yolo(const std::string &engine_file, ModelType model_type, int gpu_id, float confidence_threshold, float nms_threshold)
+{
+    checkRuntime(cudaSetDevice(gpu_id));
+    return std::shared_ptr<YoloModelImpl>((YoloModelImpl *)loadraw(engine_file, model_type, confidence_threshold, nms_threshold));
+}
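+
+// Usage sketch (the exact tensor::Image construction depends on this repo's
+// headers; the engine path and thresholds below are placeholders):
+//   auto model = yolo::load_yolo("yolov8n.engine", ModelType::YOLOV8, 0, 0.25f, 0.45f);
+//   if (model) { auto boxes = model->forward(image, stream); }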
+
+}  // namespace yolo