diff --git a/models/cv/detection/yolov8/ixrt/README.md b/models/cv/detection/yolov8/ixrt/README.md index 07558edf6f3591a70262c778309d67484d1edf4f..96d637e0d9bfd485d169ddaf7c55d26eddeacbf3 100644 --- a/models/cv/detection/yolov8/ixrt/README.md +++ b/models/cv/detection/yolov8/ixrt/README.md @@ -20,7 +20,7 @@ pip3 install onnx pip3 install onnxsim pip3 install pycocotools pip3 install ultralytics -pip3 install pycuda +pip3 install cuda-python ``` ### Download diff --git a/models/cv/detection/yolov8/ixrt/common.py b/models/cv/detection/yolov8/ixrt/common.py index dc3c2766533fa5a334a61231adb168ecf09622c3..3f28ccbc8830f5ac14703504829658f128fb0ce3 100644 --- a/models/cv/detection/yolov8/ixrt/common.py +++ b/models/cv/detection/yolov8/ixrt/common.py @@ -20,7 +20,7 @@ import numpy as np from tqdm import tqdm import tensorrt -import pycuda.driver as cuda +from cuda import cuda, cudart def load_class_names(namesfile): @@ -101,13 +101,15 @@ def setup_io_bindings(engine, context): size = np.dtype(tensorrt.nptype(dtype)).itemsize for s in shape: size *= s - allocation = cuda.mem_alloc(size) + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess binding = { "index": i, "name": name, "dtype": np.dtype(tensorrt.nptype(dtype)), "shape": list(shape), "allocation": allocation, + "nbytes": size } # print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") allocations.append(allocation) diff --git a/models/cv/detection/yolov8/ixrt/inference.py b/models/cv/detection/yolov8/ixrt/inference.py index d83b013610c132a776a2dc02663177e20a7ea2e3..9abc214234e2316e9363e5175886f553bb182eee 100644 --- a/models/cv/detection/yolov8/ixrt/inference.py +++ b/models/cv/detection/yolov8/ixrt/inference.py @@ -19,8 +19,7 @@ import argparse import time import tensorrt from tensorrt import Dims -import pycuda.autoinit -import pycuda.driver as cuda +from cuda import cuda, cudart import torch import numpy as np from tqdm import tqdm @@ -157,7 +156,8 @@ class IxRT_Validator(DetectionValidator): context.set_binding_shape(input_idx, Dims(data_shape)) inputs, outputs, allocations = setup_io_bindings(engine, context) - cuda.memcpy_htod(inputs[0]["allocation"], batch_data) + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) # Prepare the output data output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) @@ -167,7 +167,15 @@ class IxRT_Validator(DetectionValidator): end_time = time.time() forward_time += end_time - start_time - cuda.memcpy_dtoh(output, outputs[0]["allocation"]) + err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + + for alloc in allocations: + if not alloc: + continue + (err,) = cudart.cudaFree(alloc) + assert err == cudart.cudaError_t.cudaSuccess + if pad_batch: output = output[:origin_size] @@ -176,7 +184,7 @@ class IxRT_Validator(DetectionValidator): preds = self.postprocess([outputs]) self.update_metrics(preds, batch) - + if config.perf_only: fps = num_samples / forward_time return fps