#include "infer/trt/depth/depth.hpp" #include #include #include "infer/trt/affine.hpp" #include "common/check.hpp" namespace depth { bool DepthModelImpl::load(const std::string &engine_file) { trt_ = TensorRT::load(engine_file); if (trt_ == nullptr) return false; trt_->print(); auto input_dim = trt_->static_dims(0); network_input_width_ = input_dim[3]; network_input_height_ = input_dim[2]; isdynamic_model_ = trt_->has_dynamic_dim(); normalize_ = affine::Norm::alpha_beta(1 / 255.0f, 0.0f, affine::ChannelType::SwapRB); return true; } void DepthModelImpl::preprocess(const tensor::Image &image, affine::LetterBoxMatrix &affine, void *stream) { affine.compute(std::make_tuple(image.width, image.height), std::make_tuple(network_input_width_, network_input_height_)); float *input_device = input_buffer_.gpu(); size_t size_image = image.width * image.height * 3; preprocess_buffer_.gpu(size_image); preprocess_buffer_.cpu(size_image); uint8_t *image_device = preprocess_buffer_.gpu(); uint8_t *image_host = preprocess_buffer_.cpu(); float *affine_matrix_device = affine_matrix_.gpu(); float *affine_matrix_host = affine_matrix_.cpu(); // speed up cudaStream_t stream_ = (cudaStream_t)stream; memcpy(image_host, image.bgrptr, size_image); checkRuntime( cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_)); memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i)); checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i), cudaMemcpyHostToDevice, stream_)); affine::warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width, image.height, input_device, network_input_width_, network_input_height_, affine_matrix_device, 114, normalize_, stream_); } void DepthModelImpl::postprocess(int width, int height, void *stream) { adjust_memory(width, height); cudaStream_t stream_ = (cudaStream_t)stream; float *affine_matrix_device = affine_matrix_.gpu(); float *image_device = output_buffer_.gpu(); float *dst_device = depth_map_buffer_.gpu(); affine::warp_affine_bilinear_single_channel_plane( image_device, network_input_width_, network_input_width_, network_input_height_, dst_device, width, height, affine_matrix_device, 1000, stream_); } Result DepthModelImpl::forward(const tensor::Image &image, int slice_width, int slice_height, float overlap_width_ratio, float overlap_height_ratio, void *stream) { return cv::Mat(); } Result DepthModelImpl::forward(const tensor::Image &image, void *stream) { int num_image = 1; if (num_image == 0) return {}; auto input_dims = trt_->static_dims(0); int infer_batch_size = input_dims[0]; if (infer_batch_size != num_image) { if (isdynamic_model_) { infer_batch_size = num_image; input_dims[0] = num_image; if (!trt_->set_run_dims(0, input_dims)) { printf("Fail to set run dims\n"); return {}; } } else { if (infer_batch_size < num_image) { printf( "When using static shape model, number of images[%d] must be " "less than or equal to the maximum batch[%d].", num_image, infer_batch_size); return {}; } } } adjust_memory(); affine::LetterBoxMatrix affine_matrix; cudaStream_t stream_ = (cudaStream_t)stream; preprocess(image, affine_matrix, stream); #ifdef TRT10 if (!trt_->forward(std::unordered_map{ { "input", input_buffer_.gpu() }, { "output", output_buffer_.gpu() } }, stream_)) { printf("Failed to tensorRT forward.\n"); return cv::Mat(); } #else std::vector bindings{input_buffer_.gpu(), output_buffer_.gpu()}; if (!trt_->forward(bindings, stream)) { printf("Failed to tensorRT forward."); return cv::Mat(); } #endif 
  adjust_memory(image.width, image.height);
  postprocess(image.width, image.height, stream);

  // Copy the full-resolution depth map back to the host and wait for the
  // stream so the returned cv::Mat is safe to read.
  checkRuntime(cudaMemcpyAsync(depth_map_buffer_.cpu(), depth_map_buffer_.gpu(),
                               depth_map_buffer_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
  checkRuntime(cudaStreamSynchronize(stream_));

  // Note: this cv::Mat wraps the internal host buffer without copying, so the
  // returned map is only valid until the next forward() call.
  cv::Mat depth_mat(image.height, image.width, CV_32FC1, depth_map_buffer_.cpu());
  return depth_mat;
}

Infer *loadraw(const std::string &engine_file) {
  DepthModelImpl *impl = new DepthModelImpl();
  if (!impl->load(engine_file)) {
    delete impl;
    impl = nullptr;
  }
  return impl;
}

std::shared_ptr<Infer> load_depth(const std::string &engine_file, int gpu_id) {
  checkRuntime(cudaSetDevice(gpu_id));
  return std::shared_ptr<Infer>((DepthModelImpl *)loadraw(engine_file));
}

}  // namespace depth
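// A minimal usage sketch. Assumptions not confirmed by this file: the engine
// file name is hypothetical, tensor::Image can be constructed from a BGR
// pointer plus width/height, and passing a null stream selects the default
// CUDA stream. Adapt to the actual tensor::Image definition in this repo.
//
//   auto model = depth::load_depth("depth.engine", 0);
//   if (model == nullptr) { /* engine failed to load */ }
//
//   cv::Mat bgr = cv::imread("frame.jpg");
//   tensor::Image image(bgr.data, bgr.cols, bgr.rows);
//
//   // Returns a CV_32FC1 depth map at the original image resolution.
//   cv::Mat depth = model->forward(image, nullptr);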