diff --git a/models/cv/detection/yolov8/ixrt/README.md b/models/cv/detection/yolov8/ixrt/README.md
index 07558edf6f3591a70262c778309d67484d1edf4f..96d637e0d9bfd485d169ddaf7c55d26eddeacbf3 100644
--- a/models/cv/detection/yolov8/ixrt/README.md
+++ b/models/cv/detection/yolov8/ixrt/README.md
@@ -20,7 +20,7 @@ pip3 install onnx
 pip3 install onnxsim
 pip3 install pycocotools
 pip3 install ultralytics
-pip3 install pycuda
+pip3 install cuda-python
 ```
 
 ### Download
diff --git a/models/cv/detection/yolov8/ixrt/common.py b/models/cv/detection/yolov8/ixrt/common.py
index dc3c2766533fa5a334a61231adb168ecf09622c3..3f28ccbc8830f5ac14703504829658f128fb0ce3 100644
--- a/models/cv/detection/yolov8/ixrt/common.py
+++ b/models/cv/detection/yolov8/ixrt/common.py
@@ -20,7 +20,7 @@ import numpy as np
 from tqdm import tqdm
 
 import tensorrt
-import pycuda.driver as cuda
+from cuda import cuda, cudart
 
 
 def load_class_names(namesfile):
@@ -101,13 +101,15 @@ def setup_io_bindings(engine, context):
         size = np.dtype(tensorrt.nptype(dtype)).itemsize
         for s in shape:
             size *= s
-        allocation = cuda.mem_alloc(size)
+        err, allocation = cudart.cudaMalloc(size)
+        assert err == cudart.cudaError_t.cudaSuccess
         binding = {
             "index": i,
             "name": name,
             "dtype": np.dtype(tensorrt.nptype(dtype)),
             "shape": list(shape),
             "allocation": allocation,
+            "nbytes": size
         }
         # print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
         allocations.append(allocation)
diff --git a/models/cv/detection/yolov8/ixrt/inference.py b/models/cv/detection/yolov8/ixrt/inference.py
index d83b013610c132a776a2dc02663177e20a7ea2e3..9abc214234e2316e9363e5175886f553bb182eee 100644
--- a/models/cv/detection/yolov8/ixrt/inference.py
+++ b/models/cv/detection/yolov8/ixrt/inference.py
@@ -19,8 +19,7 @@ import argparse
 import time
 import tensorrt
 from tensorrt import Dims
-import pycuda.autoinit
-import pycuda.driver as cuda
+from cuda import cuda, cudart
 import torch
 import numpy as np
 from tqdm import tqdm
@@ -157,7 +156,8 @@ class IxRT_Validator(DetectionValidator):
             context.set_binding_shape(input_idx, Dims(data_shape))
             inputs, outputs, allocations = setup_io_bindings(engine, context)
 
-            cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+            err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
             # Prepare the output data
             output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
             
@@ -167,7 +167,15 @@ class IxRT_Validator(DetectionValidator):
             end_time = time.time()
             forward_time += end_time - start_time
             
-            cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+
+            for alloc in allocations:
+                if not alloc:
+                    continue
+                (err,) = cudart.cudaFree(alloc)
+                assert err == cudart.cudaError_t.cudaSuccess   
+                
             if pad_batch:
                 output = output[:origin_size]
                 
@@ -176,7 +184,7 @@ class IxRT_Validator(DetectionValidator):
             preds = self.postprocess([outputs])
             
             self.update_metrics(preds, batch)
-                
+
         if config.perf_only:
             fps = num_samples / forward_time
             return fps