depth.cu

  1. #include "infer/trt/depth/depth.hpp"
  2. #include <vector>
  3. #include <memory>
  4. #include "infer/trt/affine.hpp"
  5. #include "common/check.hpp"
  6. namespace depth
  7. {
  8. bool DepthModelImpl::load(const std::string &engine_file)
  9. {
  10. trt_ = TensorRT::load(engine_file);
  11. if (trt_ == nullptr) return false;
  12. trt_->print();
  13. auto input_dim = trt_->static_dims(0);
  14. network_input_width_ = input_dim[3];
  15. network_input_height_ = input_dim[2];
  16. isdynamic_model_ = trt_->has_dynamic_dim();
  17. normalize_ = affine::Norm::alpha_beta(1 / 255.0f, 0.0f, affine::ChannelType::SwapRB);
  18. return true;
  19. }
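// Letterbox the BGR image to the network input size: stage the image in a host
// buffer, copy it and the affine matrix to the device asynchronously, then warp,
// pad, and normalize in a single CUDA kernel on the caller's stream.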
void DepthModelImpl::preprocess(const tensor::Image &image, affine::LetterBoxMatrix &affine, void *stream)
{
    affine.compute(std::make_tuple(image.width, image.height),
                   std::make_tuple(network_input_width_, network_input_height_));

    size_t input_numel = network_input_width_ * network_input_height_ * 3;
    float *input_device = input_buffer_.gpu();

    size_t size_image = image.width * image.height * 3;
    preprocess_buffer_.gpu(size_image);
    preprocess_buffer_.cpu(size_image);
    uint8_t *image_device = preprocess_buffer_.gpu();
    uint8_t *image_host = preprocess_buffer_.cpu();

    float *affine_matrix_device = affine_matrix_.gpu();
    float *affine_matrix_host = affine_matrix_.cpu();

    // Stage on the host, then copy asynchronously so the upload runs on the caller's stream.
    cudaStream_t stream_ = (cudaStream_t)stream;
    memcpy(image_host, image.bgrptr, size_image);
    checkRuntime(
        cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
    memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
    checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
                                 cudaMemcpyHostToDevice, stream_));

    // Warp into the network input buffer, padding with 114 and applying normalize_.
    affine::warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width,
                                                     image.height, input_device, network_input_width_,
                                                     network_input_height_, affine_matrix_device, 114,
                                                     normalize_, stream_);
}
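// Resize the network-resolution depth map back to the original image size using
// the affine matrix computed during preprocessing, writing into depth_map_buffer_.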
void DepthModelImpl::postprocess(int width, int height, void *stream)
{
    adjust_memory(width, height);

    cudaStream_t stream_ = (cudaStream_t)stream;
    float *affine_matrix_device = affine_matrix_.gpu();
    float *image_device = output_buffer_.gpu();
    float *dst_device = depth_map_buffer_.gpu();

    affine::warp_affine_bilinear_single_channel_plane(
        image_device, network_input_width_, network_input_width_, network_input_height_,
        dst_device, width, height, affine_matrix_device, 1000,
        stream_);
}
Result DepthModelImpl::forward(const tensor::Image &image, int slice_width, int slice_height,
                               float overlap_width_ratio, float overlap_height_ratio, void *stream)
{
    // Sliced (tiled) inference is not implemented for the depth model.
    return cv::Mat();
}
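// Run single-image inference: preprocess on the GPU, execute the engine, warp the
// predicted depth back to the source resolution, and copy the result to the host.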
Result DepthModelImpl::forward(const tensor::Image &image, void *stream)
{
    int num_image = 1;
    if (num_image == 0) return {};

    // Match the engine's batch dimension to the number of input images.
    auto input_dims = trt_->static_dims(0);
    int infer_batch_size = input_dims[0];
    if (infer_batch_size != num_image)
    {
        if (isdynamic_model_)
        {
            infer_batch_size = num_image;
            input_dims[0] = num_image;
            if (!trt_->set_run_dims(0, input_dims))
            {
                printf("Failed to set run dims.\n");
                return {};
            }
        }
        else
        {
            if (infer_batch_size < num_image)
            {
                printf(
                    "When using a static shape model, the number of images[%d] must be "
                    "less than or equal to the maximum batch[%d].\n",
                    num_image, infer_batch_size);
                return {};
            }
        }
    }

    adjust_memory();

    affine::LetterBoxMatrix affine_matrix;
    cudaStream_t stream_ = (cudaStream_t)stream;
    preprocess(image, affine_matrix, stream);
    // Bind the network output to output_buffer_; postprocess() reads from it and
    // writes the resized depth map into depth_map_buffer_.
#ifdef TRT10
    if (!trt_->forward(std::unordered_map<std::string, const void *>{
                           {"input", input_buffer_.gpu()},
                           {"output", output_buffer_.gpu()}},
                       stream_))
    {
        printf("Failed to run TensorRT forward.\n");
        return {};
    }
#else
    std::vector<void *> bindings{input_buffer_.gpu(), output_buffer_.gpu()};
    if (!trt_->forward(bindings, stream))
    {
        printf("Failed to run TensorRT forward.\n");
        return {};
    }
#endif

    postprocess(image.width, image.height, stream);
    checkRuntime(cudaMemcpyAsync(depth_map_buffer_.cpu(), depth_map_buffer_.gpu(),
                                 depth_map_buffer_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
    checkRuntime(cudaStreamSynchronize(stream_));

    // Wrap the host depth buffer (one float per pixel) and clone so the result owns its memory.
    cv::Mat depth_mat(image.height, image.width, CV_32FC1, depth_map_buffer_.cpu());
    return depth_mat.clone();
}
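// Create a raw DepthModelImpl and return it as an Infer*; returns nullptr when the
// engine fails to load.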
Infer *loadraw(const std::string &engine_file)
{
    DepthModelImpl *impl = new DepthModelImpl();
    if (!impl->load(engine_file))
    {
        delete impl;
        impl = nullptr;
    }
    return impl;
}
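// Public factory: select the GPU, then load the engine and wrap it in a shared_ptr.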
std::shared_ptr<Infer> load_depth(const std::string &engine_file, int gpu_id)
{
    checkRuntime(cudaSetDevice(gpu_id));
    return std::shared_ptr<DepthModelImpl>((DepthModelImpl *)loadraw(engine_file));
}

}  // namespace depth