@@ -17,7 +17,6 @@ bool DepthModelImpl::load(const std::string &engine_file, ModelType model_type,
trt_->print();

auto input_dim = trt_->static_dims(0);
- bbox_head_dims_ = trt_->static_dims(1);
network_input_width_ = input_dim[3];
network_input_height_ = input_dim[2];
isdynamic_model_ = trt_->has_dynamic_dim();
@@ -26,5 +25,121 @@ bool DepthModelImpl::load(const std::string &engine_file, ModelType model_type,
return true;
}

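+// Letterbox the BGR image on the GPU and write the normalized planar tensor into the
+// network input buffer.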
+void DepthModelImpl::preprocess(const tensor::Image &image, affine::LetterBoxMatrix &affine, void *stream)
+{
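+  // Work out the letterbox mapping between the source image size and the network input size.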
+  affine.compute(std::make_tuple(image.width, image.height),
+                 std::make_tuple(network_input_width_, network_input_height_));
+
+  size_t input_numel = network_input_width_ * network_input_height_ * 3;
+  float *input_device = input_buffer_.gpu();
+  size_t size_image = image.width * image.height * 3;
+
+  preprocess_buffer_.gpu(size_image);
+  preprocess_buffer_.cpu(size_image);
+
+  uint8_t *image_device = preprocess_buffer_.gpu();
+  uint8_t *image_host = preprocess_buffer_.cpu();
+
+  float *affine_matrix_device = affine_matrix_.gpu();
+  float *affine_matrix_host = affine_matrix_.cpu();
+
+  // Stage the image and the d2i affine matrix in host buffers, then copy both to the
+  // GPU asynchronously on the caller's stream.
+  cudaStream_t stream_ = (cudaStream_t)stream;
+  memcpy(image_host, image.bgrptr, size_image);
+  checkRuntime(
+      cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
+
+  memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
+  checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
+                               cudaMemcpyHostToDevice, stream_));
+
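+  // Bilinear letterbox warp on the GPU: pads with 114 and normalizes directly into the
+  // network input tensor.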
+  affine::warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width,
+                                                   image.height, input_device, network_input_width_,
+                                                   network_input_height_, affine_matrix_device, 114,
+                                                   normalize_, stream_);
+}
+
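+// Resize the raw network depth prediction back to the original image resolution.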
+void DepthModelImpl::postprocess(int width, int height, void *stream)
+{
+  adjust_memory(width, height);
+
+  cudaStream_t stream_ = (cudaStream_t)stream;
+  float *affine_matrix_device = affine_matrix_.gpu();
+
+  float *image_device = output_buffer_.gpu();
+  float *dst_device = depth_map_buffer_.gpu();
+
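+  // Single-channel bilinear warp from network resolution to the requested width x height.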
+  affine::warp_affine_bilinear_single_channel_plane(
+      image_device, network_input_width_, network_input_width_, network_input_height_,
+      dst_device, width, height, affine_matrix_device, 1000, stream_);
+}
+
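+// Run a single image through the engine: preprocess, inference, postprocess, then copy the
+// depth map back to the host.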
+cv::Mat DepthModelImpl::forward(const tensor::Image &image, void *stream)
+{
+  int num_image = 1;
+  if (num_image == 0) return {};
+
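+  // Match the engine's batch dimension: dynamic engines are re-dimensioned to the request,
+  // static engines only need enough capacity.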
+  auto input_dims = trt_->static_dims(0);
+  int infer_batch_size = input_dims[0];
+  if (infer_batch_size != num_image)
+  {
+    if (isdynamic_model_)
+    {
+      infer_batch_size = num_image;
+      input_dims[0] = num_image;
+      if (!trt_->set_run_dims(0, input_dims))
+      {
+        printf("Failed to set run dims\n");
+        return {};
+      }
+    }
+    else
+    {
+      if (infer_batch_size < num_image)
+      {
+        printf(
+            "When using a static shape model, the number of images[%d] must be "
+            "less than or equal to the maximum batch[%d].\n",
+            num_image, infer_batch_size);
+        return {};
+      }
+    }
+  }
+  adjust_memory();
+  affine::LetterBoxMatrix affine_matrix;
+  cudaStream_t stream_ = (cudaStream_t)stream;
+  preprocess(image, affine_matrix, stream);
+
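+  // TensorRT 10 takes named I/O tensors; older versions take an ordered binding list. Either
+  // way the raw network depth lands in output_buffer_, which postprocess reads from.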
+#ifdef TRT10
+  if (!trt_->forward(std::unordered_map<std::string, const void *>{
+          { "input", input_buffer_.gpu() },
+          { "output", output_buffer_.gpu() }
+      }, stream_))
+  {
+    printf("Failed to run TensorRT forward.\n");
+    return {};
+  }
+#else
+  std::vector<void *> bindings{input_buffer_.gpu(), output_buffer_.gpu()};
+  if (!trt_->forward(bindings, stream_))
+  {
+    printf("Failed to run TensorRT forward.\n");
+    return {};
+  }
+#endif
+
+  postprocess(image.width, image.height, stream);
+
+  checkRuntime(cudaMemcpyAsync(depth_map_buffer_.cpu(), depth_map_buffer_.gpu(),
+                               depth_map_buffer_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+  checkRuntime(cudaStreamSynchronize(stream_));
+  // Wrap the host-side depth buffer in a cv::Mat at the original image size and return a
+  // deep copy so the result outlives the internal buffer.
+  return cv::Mat(image.height, image.width, CV_32FC1, depth_map_buffer_.cpu()).clone();
+}
+
} // namespace depth