depth.cu

#include "infer/trt/depth/depth.hpp"

#include <cstdio>
#include <cstring>
#include <memory>
#include <unordered_map>
#include <vector>

#include "infer/trt/affine.hpp"
#include "common/check.hpp"

namespace depth
{
bool DepthModelImpl::load(const std::string &engine_file)
{
    trt_ = TensorRT::load(engine_file);
    if (trt_ == nullptr) return false;

    trt_->print();

    // The input tensor is laid out as [batch, channels, height, width].
    auto input_dim = trt_->static_dims(0);
    network_input_width_ = input_dim[3];
    network_input_height_ = input_dim[2];
    isdynamic_model_ = trt_->has_dynamic_dim();

    // Scale pixels to [0, 1] and swap BGR -> RGB during preprocessing.
    normalize_ = affine::Norm::alpha_beta(1 / 255.0f, 0.0f, affine::ChannelType::SwapRB);
    return true;
}

void DepthModelImpl::preprocess(const tensor::Image &image, affine::LetterBoxMatrix &affine,
                                void *stream)
{
    // Compute the letterbox transform from the source image to the network input size.
    affine.compute(std::make_tuple(image.width, image.height),
                   std::make_tuple(network_input_width_, network_input_height_));

    float *input_device = input_buffer_.gpu();
    size_t size_image = image.width * image.height * 3;
    preprocess_buffer_.gpu(size_image);
    preprocess_buffer_.cpu(size_image);

    uint8_t *image_device = preprocess_buffer_.gpu();
    uint8_t *image_host = preprocess_buffer_.cpu();
    float *affine_matrix_device = affine_matrix_.gpu();
    float *affine_matrix_host = affine_matrix_.cpu();

    // Stage the image and affine matrix through the host buffers, then copy
    // asynchronously on the caller's stream.
    cudaStream_t stream_ = (cudaStream_t)stream;
    memcpy(image_host, image.bgrptr, size_image);
    checkRuntime(
        cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
    memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
    checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
                                 cudaMemcpyHostToDevice, stream_));

    // Warp to the network resolution, pad with 114, normalize, and write the planar input.
    affine::warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width,
                                                     image.height, input_device, network_input_width_,
                                                     network_input_height_, affine_matrix_device, 114,
                                                     normalize_, stream_);
}

void DepthModelImpl::postprocess(int width, int height, void *stream)
{
    adjust_memory(width, height);

    cudaStream_t stream_ = (cudaStream_t)stream;
    float *affine_matrix_device = affine_matrix_.gpu();
    float *image_device = output_buffer_.gpu();
    float *dst_device = depth_map_buffer_.gpu();

    // Warp the network-resolution depth map back to the original image size
    // using the affine matrix stored during preprocess().
    affine::warp_affine_bilinear_single_channel_plane(
        image_device, network_input_width_, network_input_width_, network_input_height_,
        dst_device, width, height, affine_matrix_device, 1000, stream_);
}

Result DepthModelImpl::forward(const tensor::Image &image, int slice_width, int slice_height,
                               float overlap_width_ratio, float overlap_height_ratio, void *stream)
{
    // Sliced (tiled) inference is not implemented for the depth model; return an empty result.
    return cv::Mat();
}

Result DepthModelImpl::forward(const tensor::Image &image, void *stream)
{
    int num_image = 1;
    if (num_image == 0) return {};

    // Reconcile the requested batch size with the engine's input shape.
    auto input_dims = trt_->static_dims(0);
    int infer_batch_size = input_dims[0];
    if (infer_batch_size != num_image)
    {
        if (isdynamic_model_)
        {
            infer_batch_size = num_image;
            input_dims[0] = num_image;
            if (!trt_->set_run_dims(0, input_dims))
            {
                printf("Failed to set run dims.\n");
                return {};
            }
        }
        else
        {
            if (infer_batch_size < num_image)
            {
                printf(
                    "When using a static shape model, the number of images[%d] must be "
                    "less than or equal to the maximum batch[%d].\n",
                    num_image, infer_batch_size);
                return {};
            }
        }
    }

    adjust_memory();
    affine::LetterBoxMatrix affine_matrix;
    cudaStream_t stream_ = (cudaStream_t)stream;
    preprocess(image, affine_matrix, stream);

#ifdef TRT10
    if (!trt_->forward(std::unordered_map<std::string, const void *>{
            {"input", input_buffer_.gpu()},
            {"output", output_buffer_.gpu()}},
        stream_))
    {
        printf("Failed to run TensorRT forward.\n");
        return cv::Mat();
    }
#else
    std::vector<void *> bindings{input_buffer_.gpu(), output_buffer_.gpu()};
    if (!trt_->forward(bindings, stream))
    {
        printf("Failed to run TensorRT forward.\n");
        return cv::Mat();
    }
#endif

    adjust_memory(image.width, image.height);
    postprocess(image.width, image.height, stream);

    // Copy the full-resolution depth map back to the host and wait for the stream to finish.
    checkRuntime(cudaMemcpyAsync(depth_map_buffer_.cpu(), depth_map_buffer_.gpu(),
                                 depth_map_buffer_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
    checkRuntime(cudaStreamSynchronize(stream_));

    // Note: the returned Mat wraps the internal host buffer, which is reused by
    // subsequent forward() calls; clone it if the result must outlive the next call.
    cv::Mat depth_mat(image.height, image.width, CV_32FC1, depth_map_buffer_.cpu());
    return depth_mat;
}

Infer *loadraw(const std::string &engine_file)
{
    DepthModelImpl *impl = new DepthModelImpl();
    if (!impl->load(engine_file))
    {
        delete impl;
        impl = nullptr;
    }
    return impl;
}

std::shared_ptr<Infer> load_depth(const std::string &engine_file, int gpu_id)
{
    checkRuntime(cudaSetDevice(gpu_id));
    return std::shared_ptr<DepthModelImpl>((DepthModelImpl *)loadraw(engine_file));
}

} // namespace depth
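
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, kept as a comment so this translation unit
// is unchanged): one way the API above might be called, assuming the Infer
// interface exposes the same forward() overload as DepthModelImpl, that Result
// is convertible to cv::Mat (as the returns above suggest), and that
// tensor::Image has a (pointer, width, height) constructor -- check
// infer/trt/depth/depth.hpp for the real signatures.
//
//   #include <opencv2/opencv.hpp>
//   #include "infer/trt/depth/depth.hpp"
//
//   int main()
//   {
//       auto model = depth::load_depth("depth.engine", 0);  // engine path, GPU id
//       if (model == nullptr) return -1;
//
//       cv::Mat bgr = cv::imread("frame.jpg");              // 8-bit BGR input
//       tensor::Image image(bgr.data, bgr.cols, bgr.rows);  // assumed ctor
//
//       cv::Mat depth = model->forward(image, nullptr);     // nullptr = default CUDA stream
//       // depth is CV_32FC1 at the original image resolution
//       return 0;
//   }
// ---------------------------------------------------------------------------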