depth.cu

  1. #include "infer/trt/depth/depth.hpp"
  2. #include <vector>
  3. #include <memory>
  4. #include "infer/trt/affine.hpp"
  5. #include "common/check.hpp"
  6. namespace depth
  7. {
  8. bool DepthModelImpl::load(const std::string &engine_file)
  9. {
  10. trt_ = TensorRT::load(engine_file);
  11. if (trt_ == nullptr) return false;
  12. trt_->print();
  13. auto input_dim = trt_->static_dims(0);
  14. network_input_width_ = input_dim[3];
  15. network_input_height_ = input_dim[2];
  16. isdynamic_model_ = trt_->has_dynamic_dim();
  17. normalize_ = affine::Norm::alpha_beta(1 / 255.0f, 0.0f, affine::ChannelType::SwapRB);
  18. return true;
  19. }
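// Letterbox the BGR image to the network input size: stage the image in a host
// buffer, copy it and the affine matrix to the device asynchronously, then warp,
// pad, and normalize in a single CUDA kernel on the caller's stream.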
void DepthModelImpl::preprocess(const tensor::Image &image, affine::LetterBoxMatrix &affine, void *stream)
{
    affine.compute(std::make_tuple(image.width, image.height),
                   std::make_tuple(network_input_width_, network_input_height_));

    size_t input_numel = network_input_width_ * network_input_height_ * 3;
    float *input_device = input_buffer_.gpu();

    size_t size_image = image.width * image.height * 3;
    preprocess_buffer_.gpu(size_image);
    preprocess_buffer_.cpu(size_image);
    uint8_t *image_device = preprocess_buffer_.gpu();
    uint8_t *image_host = preprocess_buffer_.cpu();

    float *affine_matrix_device = affine_matrix_.gpu();
    float *affine_matrix_host = affine_matrix_.cpu();

    // Stage on the host, then copy asynchronously so the upload runs on the caller's stream.
    cudaStream_t stream_ = (cudaStream_t)stream;
    memcpy(image_host, image.bgrptr, size_image);
    checkRuntime(
        cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
    memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
    checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
                                 cudaMemcpyHostToDevice, stream_));

    // Warp into the network input buffer, padding with 114 and applying normalize_.
    affine::warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width,
                                                     image.height, input_device, network_input_width_,
                                                     network_input_height_, affine_matrix_device, 114,
                                                     normalize_, stream_);
}
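// Resize the network-resolution depth map back to the original image size using
// the affine matrix computed during preprocessing, writing into depth_map_buffer_.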
void DepthModelImpl::postprocess(int width, int height, void *stream)
{
    adjust_memory(width, height);

    cudaStream_t stream_ = (cudaStream_t)stream;
    float *affine_matrix_device = affine_matrix_.gpu();
    float *image_device = output_buffer_.gpu();
    float *dst_device = depth_map_buffer_.gpu();

    affine::warp_affine_bilinear_single_channel_plane(
        image_device, network_input_width_, network_input_width_, network_input_height_,
        dst_device, width, height, affine_matrix_device, 1000,
        stream_);
}
Result DepthModelImpl::forward(const tensor::Image &image, int slice_width, int slice_height,
                               float overlap_width_ratio, float overlap_height_ratio, void *stream)
{
    // Sliced (tiled) inference is not implemented for the depth model.
    return cv::Mat();
}
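// Run single-image inference: preprocess on the GPU, execute the engine, warp the
// predicted depth back to the source resolution, and copy the result to the host.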
Result DepthModelImpl::forward(const tensor::Image &image, void *stream)
{
    int num_image = 1;
    if (num_image == 0) return {};

    // Match the engine's batch dimension to the number of input images.
    auto input_dims = trt_->static_dims(0);
    int infer_batch_size = input_dims[0];
    if (infer_batch_size != num_image)
    {
        if (isdynamic_model_)
        {
            infer_batch_size = num_image;
            input_dims[0] = num_image;
            if (!trt_->set_run_dims(0, input_dims))
            {
                printf("Failed to set run dims.\n");
                return {};
            }
        }
        else
        {
            if (infer_batch_size < num_image)
            {
                printf(
                    "When using a static shape model, the number of images[%d] must be "
                    "less than or equal to the maximum batch[%d].\n",
                    num_image, infer_batch_size);
                return {};
            }
        }
    }

    adjust_memory();

    affine::LetterBoxMatrix affine_matrix;
    cudaStream_t stream_ = (cudaStream_t)stream;
    preprocess(image, affine_matrix, stream);
    // Bind the network output to output_buffer_; postprocess() reads from it and
    // writes the resized depth map into depth_map_buffer_.
#ifdef TRT10
    if (!trt_->forward(std::unordered_map<std::string, const void *>{
                           {"input", input_buffer_.gpu()},
                           {"output", output_buffer_.gpu()}},
                       stream_))
    {
        printf("Failed to run TensorRT forward.\n");
        return {};
    }
#else
    std::vector<void *> bindings{input_buffer_.gpu(), output_buffer_.gpu()};
    if (!trt_->forward(bindings, stream))
    {
        printf("Failed to run TensorRT forward.\n");
        return {};
    }
#endif

    postprocess(image.width, image.height, stream);
    checkRuntime(cudaMemcpyAsync(depth_map_buffer_.cpu(), depth_map_buffer_.gpu(),
                                 depth_map_buffer_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
    checkRuntime(cudaStreamSynchronize(stream_));

    // Wrap the host depth buffer (one float per pixel) and clone so the result owns its memory.
    cv::Mat depth_mat(image.height, image.width, CV_32FC1, depth_map_buffer_.cpu());
    return depth_mat.clone();
}
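// Create a raw DepthModelImpl and return it as an Infer*; returns nullptr when the
// engine fails to load.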
Infer *loadraw(const std::string &engine_file)
{
    DepthModelImpl *impl = new DepthModelImpl();
    if (!impl->load(engine_file))
    {
        delete impl;
        impl = nullptr;
    }
    return impl;
}
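// Public factory: select the GPU, then load the engine and wrap it in a shared_ptr.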
std::shared_ptr<Infer> load_depth(const std::string &engine_file, int gpu_id)
{
    checkRuntime(cudaSetDevice(gpu_id));
    return std::shared_ptr<DepthModelImpl>((DepthModelImpl *)loadraw(engine_file));
}

}  // namespace depth