diff --git a/.github/workflows/flake8.yml b/.github/workflows/flake8.yml index a2c3c146..6df69def 100644 --- a/.github/workflows/flake8.yml +++ b/.github/workflows/flake8.yml @@ -11,7 +11,7 @@ jobs: - name: Set up Python 3.9 uses: actions/setup-python@v4 with: - python-version: '3.9.13' + python-version: '3.9.19' - name: Update run: sudo apt update - name: Install apt dependencies @@ -32,7 +32,7 @@ jobs: key: ${{ runner.os }}-libcamera2-${{env.LIBCAMERA-HASH}} - name: Generate libcamera if: steps.cache-libcamera.outputs.cache-hit != 'true' - run: cd /home/runner/work; git clone https://github.com/raspberrypi/libcamera.git; cd libcamera; meson build --buildtype=release -Dpipelines=rpi/vc4 -Dipas=rpi/vc4 -Dv4l2=true -Dtest=false -Dlc-compliance=disabled -Dcam=disabled -Dqcam=enabled -Dpycamera=enabled -Ddocumentation=disabled ; ninja -C build + run: cd /home/runner/work; git clone https://github.com/raspberrypi/libcamera.git; cd libcamera; meson build --buildtype=release -Dpipelines=rpi/vc4 -Dipas=rpi/vc4 -Dv4l2=true -Dtest=false -Dlc-compliance=disabled -Dcam=disabled -Dqcam=disabled -Dpycamera=enabled -Ddocumentation=disabled ; ninja -C build lint: name: lint code needs: compile @@ -42,7 +42,7 @@ jobs: - name: Set up Python 3.9 uses: actions/setup-python@v4 with: - python-version: '3.9.13' + python-version: '3.9.19' - name: Display python version run: python -c "import sys; print(sys.version)" - name: Update diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 2b1f357d..5aa2e55e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -20,5 +20,5 @@ jobs: clean: true - name: Test - run: DISPLAY=:0 PYTHONPATH=/home/pi/_work/picamera2/picamera2:/home/pi/libcamera/build/src/py:/home/pi/kmsxx/build/py:/home/pi/python-v4l2 python3 ${{github.workspace}}/tools/run_tests.py -p ${{github.workspace}} + run: DISPLAY=:0 ASSET_DIR=/home/pi/assets PYTHONPATH=/home/pi/_work/picamera2/picamera2:/home/pi/libcamera/build/src/py:/home/pi/kmsxx/build/py:/home/pi/python-v4l2 python3 ${{github.workspace}}/tools/run_tests.py -p ${{github.workspace}} timeout-minutes: 30 diff --git a/.gitignore b/.gitignore index 5f968990..10034c26 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,4 @@ docs/_build/ .idea /.spyproject .spyproject +hailort.log diff --git a/CHANGELOG.md b/CHANGELOG.md index a868aa4a..ee4b6367 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,75 @@ ### Changed +## 0.3.24 Beta Release 23 + +### Added + +* FfmpegOutput support custom audio filter + +### Changed + +* Updated for newer version of PyAV which we can use to encode + more efficiently. + +## 0.3.23 Beta Release 22 + +### Added + +* rpi::ScalerCrops control support +* Hailo multi-model support +* Stereo preview example script +* Add PyavOutput and a new CircularOutput2 +* Allow libav H264 encoder to use V4L2 hardware on VC4 platforms +* bbox-order argument for imx500_object_detection_demo script + +### Changed + +* Fix ScalerCrops tests +* imx500: Update MAX_NUM_TENSORS and MAX_NUM_DIMENSIONS +* Fix V4L2 encoder not releasing requests + +## 0.3.22 Beta Release 21 + +### Changed + +* Add to_tuple methods to libcamera Rectangle and Size types +* Add IMX500 support + +## 0.3.21 Beta Release 20 + +### Changed + +* Fixed setup.py for the devices helpers +* Fixed handling of the ScalerCrops control in app_full.py + +## 0.3.20 Beta Release 19 + +### Added + +* Initial support for Hailo AI devices, including some examples. 
+* IMX708 helper class so that the sensor HDR mode can be set with Python. + +### Changed + +* Improved handling of timeouts when cameras stop responding, including a mechanism for a complete + reset if requests stop being returned. +* Platform checking more robust. +* Add missing flush parameter to captured_request() (for use with context manager). + +## 0.3.19 Beta Release 18 + +### Added + +* Add an example showing how to forward images to other processes using zero-copy. +* Add a context manager method for capturing requests, e.g. `with picam2.captured_request() as r:` +* Encoders can skip frames, e.g. run at half the rate of the camera. + +### Changed + +* Configuration alignment fixed on Pi 5. +* Improve support for displays without alpha blending. + ## 0.3.18 Beta Release 17 ### Added diff --git a/README.md b/README.md index 0842aabb..fc010260 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ There are also many examples in the `examples` folder of this repository, and so ## Installation -_Picamera2_ is only supported on Raspberry Pi OS Bullseye (or later) images, both 32 and 64-bit. As of September 2022, _Picamera2_ is pre-installed on images downloaded from Raspberry Pi. It works on all Raspberry Pi boards right down to the Pi Zero, although performance in some areas may be worse on less powerful devices. +_Picamera2_ is only supported on Raspberry Pi OS Bullseye (or later) images, both 32 and 64-bit. As of September 2022, _Picamera2_ is pre-installed on Raspberry Pi OS images, but not on Raspberry Pi OS Lite images. It works on all Raspberry Pi boards right down to the Pi Zero, although performance in some areas may be worse on less powerful devices. _Picamera2_ is _not_ supported on: @@ -20,50 +20,17 @@ _Picamera2_ is _not_ supported on: * Raspberry Pi OS Legacy images. * Bullseye (or later) images where the legacy camera stack has been re-enabled. -On Raspberry Pi OS images, _Picamera2_ is now installed with all the GUI (_Qt_ and _OpenGL_) dependencies. On Raspberry Pi OS Lite, it is installed without the GUI dependencies, although preview images can still be displayed using DRM/KMS. If these users wish to use the additional X-Windows GUI features, they will need to run +On systems where _Picamera2_ is supported but not pre-installed, you can install it with ``` -sudo apt install -y python3-pyqt5 python3-opengl +sudo apt install python3-picamera2 --no-install-recommends ``` -(No changes are required to _Picamera2_ itself.) - -### Installation using `apt` - -`apt` is the recommended way of installing and updating _Picamera2_. - -If _Picamera2_ is already installed, you can update it with `sudo apt install -y python3-picamera2`, or as part of a full system update (for example, `sudo apt upgrade`). - -If _Picamera2_ is not already installed, then your image is presumably older and you should start with -``` -sudo apt update -sudo apt upgrade -``` -If you have installed _Picamera2_ previously using `pip`, then you should also uninstall this (`pip3 uninstall picamera2`). - -Thereafter, you can install _Picamera2_ _with_ all the GUI (_Qt_ and _OpenGL_) dependencies using +to get a slightly reduced installation with fewer of the window system related elements (this would be suitable for installing on a Raspberry Pi OS Lite system), or ``` -sudo apt install -y python3-picamera2 -``` -If you do _not_ want the GUI dependencies, use -``` -sudo apt install -y python3-picamera2 --no-install-recommends +sudo apt install python3-picamera2 ``` +for a full installation. 
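+A minimal sketch, assuming a camera module is attached, that checks the installed packages work together (the `test.jpg` filename is arbitrary):
+```python
+from picamera2 import Picamera2
+
+picam2 = Picamera2()
+picam2.start()                    # start the camera (a default preview configuration is used if none was set)
+picam2.capture_file("test.jpg")   # capture a single image to disk
+picam2.stop()
+```
+If this writes `test.jpg` without errors, _Picamera2_ and _libcamera_ are working together.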
-### Installation using `pip` - -This is no longer the recommended way to install _Picamera2_. However, if you want to do so you can use -``` -sudo apt install -y python3-libcamera python3-kms++ -sudo apt install -y python3-pyqt5 python3-prctl libatlas-base-dev ffmpeg python3-pip -pip3 install numpy --upgrade -pip3 install picamera2[gui] -``` -which will install _Picamera2_ with all the GUI (_Qt_ and _OpenGL_) dependencies. If you do not want these, please use -``` -sudo apt install -y python3-libcamera python3-kms++ -sudo apt install -y python3-prctl libatlas-base-dev ffmpeg libopenjp2-7 python3-pip -pip3 install numpy --upgrade -pip3 install picamera2 -``` +_Picamera2_ can be installed using `pip`, however, we recommend installing through `apt` as this guarantees you will get versions of _Picamera2_ and the underlying _libcamera_ libraries that have been confirmed as working together. ## Contributing diff --git a/apps/app_full.py b/apps/app_full.py index dc3e6711..3bd49bab 100755 --- a/apps/app_full.py +++ b/apps/app_full.py @@ -1180,6 +1180,7 @@ def toggle_hidden_controls(): "AfWindows", "AfPause", "AfMetering", + "ScalerCrops" } # Main widgets diff --git a/examples/ffmpeg_audio_filter.py b/examples/ffmpeg_audio_filter.py new file mode 100644 index 00000000..e1106915 --- /dev/null +++ b/examples/ffmpeg_audio_filter.py @@ -0,0 +1,26 @@ +#!/usr/bin/python3 +import time + +from picamera2 import Picamera2 +from picamera2.encoders import H264Encoder +from picamera2.outputs import FfmpegOutput + +picam2 = Picamera2() +video_config = picam2.create_video_configuration() +picam2.configure(video_config) +encoder = H264Encoder(10000000) + +# audio filter takes the left channel and copies it to the right channel +# below example copies c0 (left channel) to c1 (right channel) - convert mono to stereo + +# or add audio delay on left channel like this: audio_filter="pan=stereo|adelay=1500|0" +# source for more examples: https://ffmpeg.org/ffmpeg-filters.html#Examples-2 +output = FfmpegOutput( + 'ffmpeg_audio_filter_test.mp4', + audio=True, + audio_filter="pan=stereo|c0=c0|c1=c0" +) + +picam2.start_recording(encoder, output) +time.sleep(10) +picam2.stop_recording() diff --git a/examples/hailo/coco.txt b/examples/hailo/coco.txt new file mode 100644 index 00000000..1f42c8eb --- /dev/null +++ b/examples/hailo/coco.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorcycle +airplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +couch +potted plant +bed +dining table +toilet +tv +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush \ No newline at end of file diff --git a/examples/hailo/detect.py b/examples/hailo/detect.py new file mode 100755 index 00000000..d464cde2 --- /dev/null +++ b/examples/hailo/detect.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +"""Example module for Hailo Detection.""" + +import argparse + +import cv2 + +from picamera2 import MappedArray, Picamera2, Preview +from picamera2.devices import Hailo + + +def extract_detections(hailo_output, w, h, class_names, threshold=0.5): + """Extract 
detections from the HailoRT-postprocess output.""" + results = [] + for class_id, detections in enumerate(hailo_output): + for detection in detections: + score = detection[4] + if score >= threshold: + y0, x0, y1, x1 = detection[:4] + bbox = (int(x0 * w), int(y0 * h), int(x1 * w), int(y1 * h)) + results.append([class_names[class_id], bbox, score]) + return results + + +def draw_objects(request): + current_detections = detections + if current_detections: + with MappedArray(request, "main") as m: + for class_name, bbox, score in current_detections: + x0, y0, x1, y1 = bbox + label = f"{class_name} %{int(score * 100)}" + cv2.rectangle(m.array, (x0, y0), (x1, y1), (0, 255, 0, 0), 2) + cv2.putText(m.array, label, (x0 + 5, y0 + 15), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0, 0), 1, cv2.LINE_AA) + + +if __name__ == "__main__": + # Parse command-line arguments. + parser = argparse.ArgumentParser(description="Detection Example") + parser.add_argument("-m", "--model", help="Path for the HEF model.", + default="/usr/share/hailo-models/yolov8s_h8l.hef") + parser.add_argument("-l", "--labels", default="coco.txt", + help="Path to a text file containing labels.") + parser.add_argument("-s", "--score_thresh", type=float, default=0.5, + help="Score threshold, must be a float between 0 and 1.") + args = parser.parse_args() + + # Get the Hailo model, the input size it wants, and the size of our preview stream. + with Hailo(args.model) as hailo: + model_h, model_w, _ = hailo.get_input_shape() + video_w, video_h = 1280, 960 + + # Load class names from the labels file + with open(args.labels, 'r', encoding="utf-8") as f: + class_names = f.read().splitlines() + + # The list of detected objects to draw. + detections = None + + # Configure and start Picamera2. + with Picamera2() as picam2: + main = {'size': (video_w, video_h), 'format': 'XRGB8888'} + lores = {'size': (model_w, model_h), 'format': 'RGB888'} + controls = {'FrameRate': 30} + config = picam2.create_preview_configuration(main, lores=lores, controls=controls) + picam2.configure(config) + + picam2.start_preview(Preview.QTGL, x=0, y=0, width=video_w, height=video_h) + picam2.start() + picam2.pre_callback = draw_objects + + # Process each low resolution camera frame. 
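+        # draw_objects() above is registered as picam2.pre_callback, so it runs on the camera thread
+        # and only reads the module-level `detections` variable; the loop below overwrites it after
+        # each inference, which means every displayed frame is annotated with the latest results.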
+ while True: + frame = picam2.capture_array('lores') + + # Run inference on the preprocessed frame + results = hailo.run(frame) + + # Extract detections from the inference results + detections = extract_detections(results, video_w, video_h, class_names, args.score_thresh) diff --git a/examples/hailo/pose.py b/examples/hailo/pose.py new file mode 100755 index 00000000..85276847 --- /dev/null +++ b/examples/hailo/pose.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +import argparse + +import cv2 +from pose_utils import postproc_yolov8_pose + +from picamera2 import MappedArray, Picamera2, Preview +from picamera2.devices import Hailo + +parser = argparse.ArgumentParser(description='Pose estimation using Hailo') +parser.add_argument('-m', '--model', help="HEF file path", default="/usr/share/hailo-models/yolov8s_pose_h8l_pi.hef") +args = parser.parse_args() + +NOSE, L_EYE, R_EYE, L_EAR, R_EAR, L_SHOULDER, R_SHOULDER, L_ELBOW, R_ELBOW, \ + L_WRIST, R_WRIST, L_HIP, R_HIP, L_KNEE, R_KNEE, L_ANKLE, R_ANKLE = range(17) + +JOINT_PAIRS = [[NOSE, L_EYE], [L_EYE, L_EAR], [NOSE, R_EYE], [R_EYE, R_EAR], + [L_SHOULDER, R_SHOULDER], + [L_SHOULDER, L_ELBOW], [L_ELBOW, L_WRIST], [R_SHOULDER, R_ELBOW], [R_ELBOW, R_WRIST], + [L_SHOULDER, L_HIP], [R_SHOULDER, R_HIP], [L_HIP, R_HIP], + [L_HIP, L_KNEE], [R_HIP, R_KNEE], [L_KNEE, L_ANKLE], [R_KNEE, R_ANKLE]] + + +def visualize_pose_estimation_result(results, image, model_size, detection_threshold=0.5, joint_threshold=0.5): + image_size = (image.shape[1], image.shape[0]) + + def scale_coord(coord): + return tuple([int(c * t / f) for c, f, t in zip(coord, model_size, image_size)]) + + bboxes, scores, keypoints, joint_scores = ( + results['bboxes'], results['scores'], results['keypoints'], results['joint_scores']) + box, score, keypoint, keypoint_score = bboxes[0], scores[0], keypoints[0], joint_scores[0] + + for detection_box, detection_score, detection_keypoints, detection_keypoints_score in ( + zip(box, score, keypoint, keypoint_score)): + if detection_score < detection_threshold: + continue + + coord_min = scale_coord(detection_box[:2]) + coord_max = scale_coord(detection_box[2:]) + cv2.rectangle(image, coord_min, coord_max, (255, 0, 0), 1) + cv2.putText(image, str(detection_score), coord_min, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (36, 255, 12), 1) + + joint_visible = detection_keypoints_score > joint_threshold + + detection_keypoints = detection_keypoints.reshape(17, 2) + for joint, joint_score in zip(detection_keypoints, detection_keypoints_score): + if joint_score > joint_threshold: + cv2.circle(image, scale_coord(joint), 4, (255, 0, 255), -1) + + for joint0, joint1 in JOINT_PAIRS: + if joint_visible[joint0] and joint_visible[joint1]: + cv2.line(image, scale_coord(detection_keypoints[joint0]), + scale_coord(detection_keypoints[joint1]), (255, 0, 255), 3) + + +def draw_predictions(request): + with MappedArray(request, 'main') as m: + predictions = last_predictions + if predictions: + visualize_pose_estimation_result(predictions, m.array, model_size) + + +# ---------------- Start of the example --------------------- # + +last_predictions = None + +with Hailo(args.model) as hailo: + main_size = (1024, 768) + model_h, model_w, _ = hailo.get_input_shape() + model_size = lores_size = (model_w, model_h) + + with Picamera2() as picam2: + main = {'size': main_size, 'format': 'XRGB8888'} + lores = {'size': lores_size, 'format': 'RGB888'} + config = picam2.create_video_configuration(main, lores=lores) + picam2.configure(config) + + picam2.start_preview(Preview.QTGL, x=0, y=0, 
width=main_size[0], height=main_size[1]) + picam2.start() + picam2.pre_callback = draw_predictions + + while True: + frame = picam2.capture_array('lores') + + # Do pose estimation. + raw_detections = hailo.run(frame) + + # Tidy up the predictions. num_of_classes is always 1 (?). + last_predictions = postproc_yolov8_pose(1, raw_detections, model_size) diff --git a/examples/hailo/pose_utils.py b/examples/hailo/pose_utils.py new file mode 100644 index 00000000..39b52f98 --- /dev/null +++ b/examples/hailo/pose_utils.py @@ -0,0 +1,290 @@ +import numpy as np + +kwargs = { + 'classes': 1, + 'nms_max_output_per_class': 300, + 'anchors': {'regression_length': 15, 'strides': [8, 16, 32]}, + 'score_threshold': 0.001, + 'nms_iou_thresh': 0.7, + 'meta_arch': 'nanodet_v8', + 'device_pre_post_layers': None +} + + +def postproc_yolov8_pose(num_of_classes, raw_detections, img_size): + # The input is a dictionary of outputs for each layer. For each layer we may have: + # A single numpy array, if batching was not used. + # A list of numpy arrays, when a batch size was specified. + # We convert the "list" into an extra numpy dimensions, which is what the code here expects. + for layer, output in raw_detections.items(): + if not isinstance(output, list): + raw_detections[layer] = np.expand_dims(output, axis=0) + elif len(output) == 1: + raw_detections[layer] = np.expand_dims(output[0], axis=0) + else: + raise RuntimeError("Pose post-processing only supports a batch size of 1") + + kwargs['img_dims'] = img_size + raw_detections_keys = list(raw_detections.keys()) + layer_from_shape: dict = {raw_detections[key].shape: key for key in raw_detections_keys} + + detection_output_channels = (kwargs['anchors']['regression_length'] + 1) * 4 # (regression length + 1) * num_coordinates + keypoints = 51 + + # The following assumes that the batch size is 1: + endnodes = [raw_detections[layer_from_shape[1, 20, 20, detection_output_channels]], + raw_detections[layer_from_shape[1, 20, 20, num_of_classes]], + raw_detections[layer_from_shape[1, 20, 20, keypoints]], + raw_detections[layer_from_shape[1, 40, 40, detection_output_channels]], + raw_detections[layer_from_shape[1, 40, 40, num_of_classes]], + raw_detections[layer_from_shape[1, 40, 40, keypoints]], + raw_detections[layer_from_shape[1, 80, 80, detection_output_channels]], + raw_detections[layer_from_shape[1, 80, 80, num_of_classes]], + raw_detections[layer_from_shape[1, 80, 80, keypoints]]] + + predictions_dict = yolov8_pose_estimation_postprocess(endnodes, **kwargs) + + return predictions_dict + + +# ---------------- Architecture functions ----------------- # + +def _sigmoid(x): + return 1 / (1 + np.exp(-x)) + + +def _softmax(x): + return np.exp(x) / np.expand_dims(np.sum(np.exp(x), axis=-1), axis=-1) + + +def max(a, b): + return a if a >= b else b + + +def min(a, b): + return a if a <= b else b + + +def nms(dets, thresh): + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = np.argsort(scores)[::-1] + + ndets = dets.shape[0] + suppressed = np.zeros((ndets), dtype=int) + + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = 
inter / (iarea + areas[j] - inter) + if ovr >= thresh: + suppressed[j] = 1 + + return np.where(suppressed == 0)[0] + + +def _yolov8_decoding(raw_boxes, raw_kpts, strides, image_dims, reg_max): + boxes = None + decoded_kpts = None + + for box_distribute, kpts, stride, _ in zip(raw_boxes, raw_kpts, strides, np.arange(3)): + # create grid + shape = [int(x / stride) for x in image_dims] + grid_x = np.arange(shape[1]) + 0.5 + grid_y = np.arange(shape[0]) + 0.5 + grid_x, grid_y = np.meshgrid(grid_x, grid_y) + ct_row = grid_y.flatten() * stride + ct_col = grid_x.flatten() * stride + center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1) + + # box distribution to distance + reg_range = np.arange(reg_max + 1) + box_distribute = np.reshape( + box_distribute, (-1, box_distribute.shape[1] * box_distribute.shape[2], 4, reg_max + 1)) + box_distance = _softmax(box_distribute) + box_distance = box_distance * np.reshape(reg_range, (1, 1, 1, -1)) + box_distance = np.sum(box_distance, axis=-1) + box_distance = box_distance * stride + + # decode box + box_distance = np.concatenate([box_distance[:, :, :2] * (-1), box_distance[:, :, 2:]], axis=-1) + decode_box = np.expand_dims(center, axis=0) + box_distance + + xmin = decode_box[:, :, 0] + ymin = decode_box[:, :, 1] + xmax = decode_box[:, :, 2] + ymax = decode_box[:, :, 3] + decode_box = np.transpose([xmin, ymin, xmax, ymax], [1, 2, 0]) + + xywh_box = np.transpose([(xmin + xmax) / 2, (ymin + ymax) / 2, xmax - xmin, ymax - ymin], [1, 2, 0]) + boxes = xywh_box if boxes is None else np.concatenate([boxes, xywh_box], axis=1) + + # kpts decoding + kpts[..., :2] *= 2 + kpts[..., :2] = stride * (kpts[..., :2] - 0.5) + np.expand_dims(center[..., :2], axis=1) + + decoded_kpts = kpts if decoded_kpts is None else np.concatenate([decoded_kpts, kpts], axis=1) + + return boxes, decoded_kpts + + +def xywh2xyxy(x): + y = np.copy(x) + y[:, 0] = x[:, 0] - x[:, 2] / 2 + y[:, 1] = x[:, 1] - x[:, 3] / 2 + y[:, 2] = x[:, 0] + x[:, 2] / 2 + y[:, 3] = x[:, 1] + x[:, 3] / 2 + return y + + +def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.45, + max_det=100, n_kpts=17): + """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections. 
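+    Proposals below conf_thres are discarded first; the survivors are sorted by confidence and then
+    suppressed greedily via nms(), which drops any box whose IoU with an already-kept box,
+    inter / (area_i + area_j - inter), reaches iou_thres.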
+ + Args: + prediction: numpy.ndarray with shape (batch_size, num_proposals, 56) + conf_thres: confidence threshold for NMS + iou_thres: IoU threshold for NMS + max_det: Maximal number of detections to keep after NMS + nm: Number of masks + multi_label: Consider only best class per proposal or all conf_thresh passing proposals + + Returns: + A list of per image detections, where each is a dictionary with the following structure: + { + 'detection_boxes': numpy.ndarray with shape (num_detections, 4), + 'keypoints': numpy.ndarray with shape (num_detections, 17, 3), + 'detection_scores': numpy.ndarray with shape (num_detections, 1), + 'num_detections': int + } + """ + assert 0 <= conf_thres <= 1, \ + f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0' + assert 0 <= iou_thres <= 1, \ + f'Invalid IoU threshold {iou_thres}, valid values are between 0.0 and 1.0' + + nc = prediction.shape[2] - n_kpts * 3 - 4 # number of classes + xc = prediction[..., 4] > conf_thres # candidates + + # max_wh = 7680 # (pixels) maximum box width and height + ki = 4 + nc # keypoints start index + output = [] + for xi, x in enumerate(prediction): # image index, image inference + x = x[xc[xi]] + # If none remain process next image + if not x.shape[0]: + output.append({'bboxes': np.zeros((0, 4)), + 'keypoints': np.zeros((0, n_kpts, 3)), + 'scores': np.zeros((0)), + 'num_detections': 0}) + continue + + # (center_x, center_y, width, height) to (x1, y1, x2, y2) + boxes = xywh2xyxy(x[:, :4]) + kpts = x[:, ki:] + + conf = np.expand_dims(x[:, 4:ki].max(1), 1) + j = np.expand_dims(x[:, 4:ki].argmax(1), 1).astype(np.float32) + + keep = np.squeeze(conf, 1) > conf_thres + x = np.concatenate((boxes, conf, j, kpts), 1)[keep] + + # sort by confidence + x = x[x[:, 4].argsort()[::-1]] + + boxes = x[:, :4] + conf = x[:, 4:5] + preds = np.hstack([boxes.astype(np.float32), conf.astype(np.float32)]) + + keep = nms(preds, iou_thres) + if keep.shape[0] > max_det: + keep = keep[:max_det] + + out = x[keep] + scores = out[:, 4] + boxes = out[:, :4] + kpts = out[:, 6:] + kpts = np.reshape(kpts, (-1, n_kpts, 3)) + + out = {'bboxes': boxes, + 'keypoints': kpts, + 'scores': scores, + 'num_detections': int(scores.shape[0])} + + output.append(out) + return output + + +def yolov8_pose_estimation_postprocess(endnodes, **kwargs): + """Decode and run NMS on Yolov8 pose estimation output. 
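+    The 20x20, 40x40 and 80x80 output groups listed below are paired with strides 32, 16 and 8
+    respectively when _yolov8_decoding() maps grid-cell predictions back to image coordinates.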
+ + endnodes is a list of 10 tensors: + endnodes[0]: bbox output with shapes (BS, 20, 20, 64) + endnodes[1]: scores output with shapes (BS, 20, 20, 80) + endnodes[2]: keypoints output with shapes (BS, 20, 20, 51) + endnodes[3]: bbox output with shapes (BS, 40, 40, 64) + endnodes[4]: scores output with shapes (BS, 40, 40, 80) + endnodes[5]: keypoints output with shapes (BS, 40, 40, 51) + endnodes[6]: bbox output with shapes (BS, 80, 80, 64) + endnodes[7]: scores output with shapes (BS, 80, 80, 80) + endnodes[8]: keypoints output with shapes (BS, 80, 80, 51) + + Returns: + A list of per image detections, where each is a dictionary with the following structure: + { + 'detection_boxes': numpy.ndarray with shape (num_detections, 4), + 'keypoints': numpy.ndarray with shape (num_detections, 3), + 'detection_classes': numpy.ndarray with shape (num_detections, 80), + 'detection_scores': numpy.ndarray with shape (num_detections, 80) + } + """ + batch_size = endnodes[0].shape[0] + num_classes = kwargs['classes'] # always 1 + max_detections = kwargs['nms_max_output_per_class'] + strides = kwargs['anchors']['strides'][::-1] + image_dims = tuple(kwargs['img_dims']) + reg_max = kwargs['anchors']['regression_length'] + raw_boxes = endnodes[:7:3] + scores = [np.reshape(s, (-1, s.shape[1] * s.shape[2], num_classes)) for s in endnodes[1:8:3]] + scores = np.concatenate(scores, axis=1) + kpts = [np.reshape(c, (-1, c.shape[1] * c.shape[2], 17, 3)) for c in endnodes[2:9:3]] + decoded_boxes, decoded_kpts = _yolov8_decoding(raw_boxes, kpts, strides, image_dims, reg_max) + score_thres = kwargs['score_threshold'] + iou_thres = kwargs['nms_iou_thresh'] + + decoded_kpts = np.reshape(decoded_kpts, (batch_size, -1, 51)) + predictions = np.concatenate([decoded_boxes, scores, decoded_kpts], axis=2) + nms_res = non_max_suppression(predictions, conf_thres=score_thres, iou_thres=iou_thres, max_det=max_detections) + output = {} + output['bboxes'] = np.zeros((batch_size, max_detections, 4)) + output['keypoints'] = np.zeros((batch_size, max_detections, 17, 2)) + output['joint_scores'] = np.zeros((batch_size, max_detections, 17, 1)) + output['scores'] = np.zeros((batch_size, max_detections, 1)) + for b in range(batch_size): + output['bboxes'][b, :nms_res[b]['num_detections']] = nms_res[b]['bboxes'] + output['keypoints'][b, :nms_res[b]['num_detections']] = nms_res[b]['keypoints'][..., :2] + output['joint_scores'][b, :nms_res[b]['num_detections'], ..., 0] = _sigmoid(nms_res[b]['keypoints'][..., 2]) + output['scores'][b, :nms_res[b]['num_detections'], ..., 0] = nms_res[b]['scores'] + return output diff --git a/examples/imx500/assets/coco_labels.txt b/examples/imx500/assets/coco_labels.txt new file mode 100644 index 00000000..a76dd2a2 --- /dev/null +++ b/examples/imx500/assets/coco_labels.txt @@ -0,0 +1,90 @@ +person +bicycle +car +motorcycle +airplane +bus +train +truck +boat +traffic light +fire hydrant +- +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +- +backpack +umbrella +- +- +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +- +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +couch +potted plant +bed +- +dining table +- +- +toilet +- +tv +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +- +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff 
--git a/examples/imx500/assets/colours.txt b/examples/imx500/assets/colours.txt new file mode 100644 index 00000000..02141d11 --- /dev/null +++ b/examples/imx500/assets/colours.txt @@ -0,0 +1,21 @@ +128 0 0 255 +0 128 0 255 +128 128 0 255 +0 0 128 255 +128 0 128 255 +0 128 128 255 +128 128 128 255 +64 0 0 255 +192 0 0 255 +64 128 0 255 +192 128 0 255 +64 0 128 255 +192 0 128 255 +64 128 128 255 +192 128 128 255 +0 64 0 255 +128 64 0 255 +0 192 0 255 +128 192 0 255 +0 64 128 255 +0 0 0 255 \ No newline at end of file diff --git a/examples/imx500/assets/imagenet_labels.txt b/examples/imx500/assets/imagenet_labels.txt new file mode 100644 index 00000000..17459f63 --- /dev/null +++ b/examples/imx500/assets/imagenet_labels.txt @@ -0,0 +1,1001 @@ +0:background +1:tench, Tinca tinca +2:goldfish, Carassius auratus +3:great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +4:tiger shark, Galeocerdo cuvieri +5:hammerhead, hammerhead shark +6:electric ray, crampfish, numbfish, torpedo +7:stingray +8:cock +9:hen +10:ostrich, Struthio camelus +11:brambling, Fringilla montifringilla +12:goldfinch, Carduelis carduelis +13:house finch, linnet, Carpodacus mexicanus +14:junco, snowbird +15:indigo bunting, indigo finch, indigo bird, Passerina cyanea +16:robin, American robin, Turdus migratorius +17:bulbul +18:jay +19:magpie +20:chickadee +21:water ouzel, dipper +22:kite +23:bald eagle, American eagle, Haliaeetus leucocephalus +24:vulture +25:great grey owl, great gray owl, Strix nebulosa +26:European fire salamander, Salamandra salamandra +27:common newt, Triturus vulgaris +28:eft +29:spotted salamander, Ambystoma maculatum +30:axolotl, mud puppy, Ambystoma mexicanum +31:bullfrog, Rana catesbeiana +32:tree frog, tree-frog +33:tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +34:loggerhead, loggerhead turtle, Caretta caretta +35:leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +36:mud turtle +37:terrapin +38:box turtle, box tortoise +39:banded gecko +40:common iguana, iguana, Iguana iguana +41:American chameleon, anole, Anolis carolinensis +42:whiptail, whiptail lizard +43:agama +44:frilled lizard, Chlamydosaurus kingi +45:alligator lizard +46:Gila monster, Heloderma suspectum +47:green lizard, Lacerta viridis +48:African chameleon, Chamaeleo chamaeleon +49:Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +50:African crocodile, Nile crocodile, Crocodylus niloticus +51:American alligator, Alligator mississipiensis +52:triceratops +53:thunder snake, worm snake, Carphophis amoenus +54:ringneck snake, ring-necked snake, ring snake +55:hognose snake, puff adder, sand viper +56:green snake, grass snake +57:king snake, kingsnake +58:garter snake, grass snake +59:water snake +60:vine snake +61:night snake, Hypsiglena torquata +62:boa constrictor, Constrictor constrictor +63:rock python, rock snake, Python sebae +64:Indian cobra, Naja naja +65:green mamba +66:sea snake +67:horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +68:diamondback, diamondback rattlesnake, Crotalus adamanteus +69:sidewinder, horned rattlesnake, Crotalus cerastes +70:trilobite +71:harvestman, daddy longlegs, Phalangium opilio +72:scorpion +73:black and gold garden spider, Argiope aurantia +74:barn spider, Araneus cavaticus +75:garden spider, Aranea diademata +76:black widow, Latrodectus mactans +77:tarantula +78:wolf spider, hunting spider +79:tick +80:centipede +81:black grouse +82:ptarmigan +83:ruffed grouse, partridge, Bonasa 
umbellus +84:prairie chicken, prairie grouse, prairie fowl +85:peacock +86:quail +87:partridge +88:African grey, African gray, Psittacus erithacus +89:macaw +90:sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +91:lorikeet +92:coucal +93:bee eater +94:hornbill +95:hummingbird +96:jacamar +97:toucan +98:drake +99:red-breasted merganser, Mergus serrator +100:goose +101:black swan, Cygnus atratus +102:tusker +103:echidna, spiny anteater, anteater +104:platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +105:wallaby, brush kangaroo +106:koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +107:wombat +108:jellyfish +109:sea anemone, anemone +110:brain coral +111:flatworm, platyhelminth +112:nematode, nematode worm, roundworm +113:conch +114:snail +115:slug +116:sea slug, nudibranch +117:chiton, coat-of-mail shell, sea cradle, polyplacophore +118:chambered nautilus, pearly nautilus, nautilus +119:Dungeness crab, Cancer magister +120:rock crab, Cancer irroratus +121:fiddler crab +122:king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +123:American lobster, Northern lobster, Maine lobster, Homarus americanus +124:spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +125:crayfish, crawfish, crawdad, crawdaddy +126:hermit crab +127:isopod +128:white stork, Ciconia ciconia +129:black stork, Ciconia nigra +130:spoonbill +131:flamingo +132:little blue heron, Egretta caerulea +133:American egret, great white heron, Egretta albus +134:bittern +135:crane +136:limpkin, Aramus pictus +137:European gallinule, Porphyrio porphyrio +138:American coot, marsh hen, mud hen, water hen, Fulica americana +139:bustard +140:ruddy turnstone, Arenaria interpres +141:red-backed sandpiper, dunlin, Erolia alpina +142:redshank, Tringa totanus +143:dowitcher +144:oystercatcher, oyster catcher +145:pelican +146:king penguin, Aptenodytes patagonica +147:albatross, mollymawk +148:grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +149:killer whale, killer, orca, grampus, sea wolf, Orcinus orca +150:dugong, Dugong dugon +151:sea lion +152:Chihuahua +153:Japanese spaniel +154:Maltese dog, Maltese terrier, Maltese +155:Pekinese, Pekingese, Peke +156:Shih-Tzu +157:Blenheim spaniel +158:papillon +159:toy terrier +160:Rhodesian ridgeback +161:Afghan hound, Afghan +162:basset, basset hound +163:beagle +164:bloodhound, sleuthhound +165:bluetick +166:black-and-tan coonhound +167:Walker hound, Walker foxhound +168:English foxhound +169:redbone +170:borzoi, Russian wolfhound +171:Irish wolfhound +172:Italian greyhound +173:whippet +174:Ibizan hound, Ibizan Podenco +175:Norwegian elkhound, elkhound +176:otterhound, otter hound +177:Saluki, gazelle hound +178:Scottish deerhound, deerhound +179:Weimaraner +180:Staffordshire bullterrier, Staffordshire bull terrier +181:American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +182:Bedlington terrier +183:Border terrier +184:Kerry blue terrier +185:Irish terrier +186:Norfolk terrier +187:Norwich terrier +188:Yorkshire terrier +189:wire-haired fox terrier +190:Lakeland terrier +191:Sealyham terrier, Sealyham +192:Airedale, Airedale terrier +193:cairn, cairn terrier +194:Australian terrier +195:Dandie Dinmont, Dandie Dinmont terrier +196:Boston bull, Boston terrier +197:miniature schnauzer +198:giant schnauzer +199:standard schnauzer +200:Scotch terrier, Scottish terrier, Scottie +201:Tibetan terrier, 
chrysanthemum dog +202:silky terrier, Sydney silky +203:soft-coated wheaten terrier +204:West Highland white terrier +205:Lhasa, Lhasa apso +206:flat-coated retriever +207:curly-coated retriever +208:golden retriever +209:Labrador retriever +210:Chesapeake Bay retriever +211:German short-haired pointer +212:vizsla, Hungarian pointer +213:English setter +214:Irish setter, red setter +215:Gordon setter +216:Brittany spaniel +217:clumber, clumber spaniel +218:English springer, English springer spaniel +219:Welsh springer spaniel +220:cocker spaniel, English cocker spaniel, cocker +221:Sussex spaniel +222:Irish water spaniel +223:kuvasz +224:schipperke +225:groenendael +226:malinois +227:briard +228:kelpie +229:komondor +230:Old English sheepdog, bobtail +231:Shetland sheepdog, Shetland sheep dog, Shetland +232:collie +233:Border collie +234:Bouvier des Flandres, Bouviers des Flandres +235:Rottweiler +236:German shepherd, German shepherd dog, German police dog, alsatian +237:Doberman, Doberman pinscher +238:miniature pinscher +239:Greater Swiss Mountain dog +240:Bernese mountain dog +241:Appenzeller +242:EntleBucher +243:boxer +244:bull mastiff +245:Tibetan mastiff +246:French bulldog +247:Great Dane +248:Saint Bernard, St Bernard +249:Eskimo dog, husky +250:malamute, malemute, Alaskan malamute +251:Siberian husky +252:dalmatian, coach dog, carriage dog +253:affenpinscher, monkey pinscher, monkey dog +254:basenji +255:pug, pug-dog +256:Leonberg +257:Newfoundland, Newfoundland dog +258:Great Pyrenees +259:Samoyed, Samoyede +260:Pomeranian +261:chow, chow chow +262:keeshond +263:Brabancon griffon +264:Pembroke, Pembroke Welsh corgi +265:Cardigan, Cardigan Welsh corgi +266:toy poodle +267:miniature poodle +268:standard poodle +269:Mexican hairless +270:timber wolf, grey wolf, gray wolf, Canis lupus +271:white wolf, Arctic wolf, Canis lupus tundrarum +272:red wolf, maned wolf, Canis rufus, Canis niger +273:coyote, prairie wolf, brush wolf, Canis latrans +274:dingo, warrigal, warragal, Canis dingo +275:dhole, Cuon alpinus +276:African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +277:hyena, hyaena +278:red fox, Vulpes vulpes +279:kit fox, Vulpes macrotis +280:Arctic fox, white fox, Alopex lagopus +281:grey fox, gray fox, Urocyon cinereoargenteus +282:tabby, tabby cat +283:tiger cat +284:Persian cat +285:Siamese cat, Siamese +286:Egyptian cat +287:cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +288:lynx, catamount +289:leopard, Panthera pardus +290:snow leopard, ounce, Panthera uncia +291:jaguar, panther, Panthera onca, Felis onca +292:lion, king of beasts, Panthera leo +293:tiger, Panthera tigris +294:cheetah, chetah, Acinonyx jubatus +295:brown bear, bruin, Ursus arctos +296:American black bear, black bear, Ursus americanus, Euarctos americanus +297:ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +298:sloth bear, Melursus ursinus, Ursus ursinus +299:mongoose +300:meerkat, mierkat +301:tiger beetle +302:ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +303:ground beetle, carabid beetle +304:long-horned beetle, longicorn, longicorn beetle +305:leaf beetle, chrysomelid +306:dung beetle +307:rhinoceros beetle +308:weevil +309:fly +310:bee +311:ant, emmet, pismire +312:grasshopper, hopper +313:cricket +314:walking stick, walkingstick, stick insect +315:cockroach, roach +316:mantis, mantid +317:cicada, cicala +318:leafhopper +319:lacewing, lacewing fly +320:dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake 
doctor, mosquito hawk, skeeter hawk +321:damselfly +322:admiral +323:ringlet, ringlet butterfly +324:monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +325:cabbage butterfly +326:sulphur butterfly, sulfur butterfly +327:lycaenid, lycaenid butterfly +328:starfish, sea star +329:sea urchin +330:sea cucumber, holothurian +331:wood rabbit, cottontail, cottontail rabbit +332:hare +333:Angora, Angora rabbit +334:hamster +335:porcupine, hedgehog +336:fox squirrel, eastern fox squirrel, Sciurus niger +337:marmot +338:beaver +339:guinea pig, Cavia cobaya +340:sorrel +341:zebra +342:hog, pig, grunter, squealer, Sus scrofa +343:wild boar, boar, Sus scrofa +344:warthog +345:hippopotamus, hippo, river horse, Hippopotamus amphibius +346:ox +347:water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +348:bison +349:ram, tup +350:bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +351:ibex, Capra ibex +352:hartebeest +353:impala, Aepyceros melampus +354:gazelle +355:Arabian camel, dromedary, Camelus dromedarius +356:llama +357:weasel +358:mink +359:polecat, fitch, foulmart, foumart, Mustela putorius +360:black-footed ferret, ferret, Mustela nigripes +361:otter +362:skunk, polecat, wood pussy +363:badger +364:armadillo +365:three-toed sloth, ai, Bradypus tridactylus +366:orangutan, orang, orangutang, Pongo pygmaeus +367:gorilla, Gorilla gorilla +368:chimpanzee, chimp, Pan troglodytes +369:gibbon, Hylobates lar +370:siamang, Hylobates syndactylus, Symphalangus syndactylus +371:guenon, guenon monkey +372:patas, hussar monkey, Erythrocebus patas +373:baboon +374:macaque +375:langur +376:colobus, colobus monkey +377:proboscis monkey, Nasalis larvatus +378:marmoset +379:capuchin, ringtail, Cebus capucinus +380:howler monkey, howler +381:titi, titi monkey +382:spider monkey, Ateles geoffroyi +383:squirrel monkey, Saimiri sciureus +384:Madagascar cat, ring-tailed lemur, Lemur catta +385:indri, indris, Indri indri, Indri brevicaudatus +386:Indian elephant, Elephas maximus +387:African elephant, Loxodonta africana +388:lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +389:giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +390:barracouta, snoek +391:eel +392:coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +393:rock beauty, Holocanthus tricolor +394:anemone fish +395:sturgeon +396:gar, garfish, garpike, billfish, Lepisosteus osseus +397:lionfish +398:puffer, pufferfish, blowfish, globefish +399:abacus +400:abaya +401:academic gown, academic robe, judge's robe +402:accordion, piano accordion, squeeze box +403:acoustic guitar +404:aircraft carrier, carrier, flattop, attack aircraft carrier +405:airliner +406:airship, dirigible +407:altar +408:ambulance +409:amphibian, amphibious vehicle +410:analog clock +411:apiary, bee house +412:apron +413:ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +414:assault rifle, assault gun +415:backpack, back pack, knapsack, packsack, rucksack, haversack +416:bakery, bakeshop, bakehouse +417:balance beam, beam +418:balloon +419:ballpoint, ballpoint pen, ballpen, Biro +420:Band Aid +421:banjo +422:bannister, banister, balustrade, balusters, handrail +423:barbell +424:barber chair +425:barbershop +426:barn +427:barometer +428:barrel, cask +429:barrow, garden cart, lawn cart, wheelbarrow +430:baseball +431:basketball +432:bassinet +433:bassoon +434:bathing cap, swimming cap +435:bath towel +436:bathtub, bathing tub, 
bath, tub +437:beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +438:beacon, lighthouse, beacon light, pharos +439:beaker +440:bearskin, busby, shako +441:beer bottle +442:beer glass +443:bell cote, bell cot +444:bib +445:bicycle-built-for-two, tandem bicycle, tandem +446:bikini, two-piece +447:binder, ring-binder +448:binoculars, field glasses, opera glasses +449:birdhouse +450:boathouse +451:bobsled, bobsleigh, bob +452:bolo tie, bolo, bola tie, bola +453:bonnet, poke bonnet +454:bookcase +455:bookshop, bookstore, bookstall +456:bottlecap +457:bow +458:bow tie, bow-tie, bowtie +459:brass, memorial tablet, plaque +460:brassiere, bra, bandeau +461:breakwater, groin, groyne, mole, bulwark, seawall, jetty +462:breastplate, aegis, egis +463:broom +464:bucket, pail +465:buckle +466:bulletproof vest +467:bullet train, bullet +468:butcher shop, meat market +469:cab, hack, taxi, taxicab +470:caldron, cauldron +471:candle, taper, wax light +472:cannon +473:canoe +474:can opener, tin opener +475:cardigan +476:car mirror +477:carousel, carrousel, merry-go-round, roundabout, whirligig +478:carpenter's kit, tool kit +479:carton +480:car wheel +481:cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +482:cassette +483:cassette player +484:castle +485:catamaran +486:CD player +487:cello, violoncello +488:cellular telephone, cellular phone, cellphone, cell, mobile phone +489:chain +490:chainlink fence +491:chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +492:chain saw, chainsaw +493:chest +494:chiffonier, commode +495:chime, bell, gong +496:china cabinet, china closet +497:Christmas stocking +498:church, church building +499:cinema, movie theater, movie theatre, movie house, picture palace +500:cleaver, meat cleaver, chopper +501:cliff dwelling +502:cloak +503:clog, geta, patten, sabot +504:cocktail shaker +505:coffee mug +506:coffeepot +507:coil, spiral, volute, whorl, helix +508:combination lock +509:computer keyboard, keypad +510:confectionery, confectionary, candy store +511:container ship, containership, container vessel +512:convertible +513:corkscrew, bottle screw +514:cornet, horn, trumpet, trump +515:cowboy boot +516:cowboy hat, ten-gallon hat +517:cradle +518:crane +519:crash helmet +520:crate +521:crib, cot +522:Crock Pot +523:croquet ball +524:crutch +525:cuirass +526:dam, dike, dyke +527:desk +528:desktop computer +529:dial telephone, dial phone +530:diaper, nappy, napkin +531:digital clock +532:digital watch +533:dining table, board +534:dishrag, dishcloth +535:dishwasher, dish washer, dishwashing machine +536:disk brake, disc brake +537:dock, dockage, docking facility +538:dogsled, dog sled, dog sleigh +539:dome +540:doormat, welcome mat +541:drilling platform, offshore rig +542:drum, membranophone, tympan +543:drumstick +544:dumbbell +545:Dutch oven +546:electric fan, blower +547:electric guitar +548:electric locomotive +549:entertainment center +550:envelope +551:espresso maker +552:face powder +553:feather boa, boa +554:file, file cabinet, filing cabinet +555:fireboat +556:fire engine, fire truck +557:fire screen, fireguard +558:flagpole, flagstaff +559:flute, transverse flute +560:folding chair +561:football helmet +562:forklift +563:fountain +564:fountain pen +565:four-poster +566:freight car +567:French horn, horn +568:frying pan, frypan, skillet +569:fur coat +570:garbage truck, dustcart +571:gasmask, respirator, gas helmet +572:gas pump, gasoline 
pump, petrol pump, island dispenser +573:goblet +574:go-kart +575:golf ball +576:golfcart, golf cart +577:gondola +578:gong, tam-tam +579:gown +580:grand piano, grand +581:greenhouse, nursery, glasshouse +582:grille, radiator grille +583:grocery store, grocery, food market, market +584:guillotine +585:hair slide +586:hair spray +587:half track +588:hammer +589:hamper +590:hand blower, blow dryer, blow drier, hair dryer, hair drier +591:hand-held computer, hand-held microcomputer +592:handkerchief, hankie, hanky, hankey +593:hard disc, hard disk, fixed disk +594:harmonica, mouth organ, harp, mouth harp +595:harp +596:harvester, reaper +597:hatchet +598:holster +599:home theater, home theatre +600:honeycomb +601:hook, claw +602:hoopskirt, crinoline +603:horizontal bar, high bar +604:horse cart, horse-cart +605:hourglass +606:iPod +607:iron, smoothing iron +608:jack-o'-lantern +609:jean, blue jean, denim +610:jeep, landrover +611:jersey, T-shirt, tee shirt +612:jigsaw puzzle +613:jinrikisha, ricksha, rickshaw +614:joystick +615:kimono +616:knee pad +617:knot +618:lab coat, laboratory coat +619:ladle +620:lampshade, lamp shade +621:laptop, laptop computer +622:lawn mower, mower +623:lens cap, lens cover +624:letter opener, paper knife, paperknife +625:library +626:lifeboat +627:lighter, light, igniter, ignitor +628:limousine, limo +629:liner, ocean liner +630:lipstick, lip rouge +631:Loafer +632:lotion +633:loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +634:loupe, jeweler's loupe +635:lumbermill, sawmill +636:magnetic compass +637:mailbag, postbag +638:mailbox, letter box +639:maillot +640:maillot, tank suit +641:manhole cover +642:maraca +643:marimba, xylophone +644:mask +645:matchstick +646:maypole +647:maze, labyrinth +648:measuring cup +649:medicine chest, medicine cabinet +650:megalith, megalithic structure +651:microphone, mike +652:microwave, microwave oven +653:military uniform +654:milk can +655:minibus +656:miniskirt, mini +657:minivan +658:missile +659:mitten +660:mixing bowl +661:mobile home, manufactured home +662:Model T +663:modem +664:monastery +665:monitor +666:moped +667:mortar +668:mortarboard +669:mosque +670:mosquito net +671:motor scooter, scooter +672:mountain bike, all-terrain bike, off-roader +673:mountain tent +674:mouse, computer mouse +675:mousetrap +676:moving van +677:muzzle +678:nail +679:neck brace +680:necklace +681:nipple +682:notebook, notebook computer +683:obelisk +684:oboe, hautboy, hautbois +685:ocarina, sweet potato +686:odometer, hodometer, mileometer, milometer +687:oil filter +688:organ, pipe organ +689:oscilloscope, scope, cathode-ray oscilloscope, CRO +690:overskirt +691:oxcart +692:oxygen mask +693:packet +694:paddle, boat paddle +695:paddlewheel, paddle wheel +696:padlock +697:paintbrush +698:pajama, pyjama, pj's, jammies +699:palace +700:panpipe, pandean pipe, syrinx +701:paper towel +702:parachute, chute +703:parallel bars, bars +704:park bench +705:parking meter +706:passenger car, coach, carriage +707:patio, terrace +708:pay-phone, pay-station +709:pedestal, plinth, footstall +710:pencil box, pencil case +711:pencil sharpener +712:perfume, essence +713:Petri dish +714:photocopier +715:pick, plectrum, plectron +716:pickelhaube +717:picket fence, paling +718:pickup, pickup truck +719:pier +720:piggy bank, penny bank +721:pill bottle +722:pillow +723:ping-pong ball +724:pinwheel +725:pirate, pirate ship +726:pitcher, ewer +727:plane, carpenter's plane, woodworking plane +728:planetarium +729:plastic bag +730:plate rack 
+731:plow, plough +732:plunger, plumber's helper +733:Polaroid camera, Polaroid Land camera +734:pole +735:police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +736:poncho +737:pool table, billiard table, snooker table +738:pop bottle, soda bottle +739:pot, flowerpot +740:potter's wheel +741:power drill +742:prayer rug, prayer mat +743:printer +744:prison, prison house +745:projectile, missile +746:projector +747:puck, hockey puck +748:punching bag, punch bag, punching ball, punchball +749:purse +750:quill, quill pen +751:quilt, comforter, comfort, puff +752:racer, race car, racing car +753:racket, racquet +754:radiator +755:radio, wireless +756:radio telescope, radio reflector +757:rain barrel +758:recreational vehicle, RV, R.V. +759:reel +760:reflex camera +761:refrigerator, icebox +762:remote control, remote +763:restaurant, eating house, eating place, eatery +764:revolver, six-gun, six-shooter +765:rifle +766:rocking chair, rocker +767:rotisserie +768:rubber eraser, rubber, pencil eraser +769:rugby ball +770:rule, ruler +771:running shoe +772:safe +773:safety pin +774:saltshaker, salt shaker +775:sandal +776:sarong +777:sax, saxophone +778:scabbard +779:scale, weighing machine +780:school bus +781:schooner +782:scoreboard +783:screen, CRT screen +784:screw +785:screwdriver +786:seat belt, seatbelt +787:sewing machine +788:shield, buckler +789:shoe shop, shoe-shop, shoe store +790:shoji +791:shopping basket +792:shopping cart +793:shovel +794:shower cap +795:shower curtain +796:ski +797:ski mask +798:sleeping bag +799:slide rule, slipstick +800:sliding door +801:slot, one-armed bandit +802:snorkel +803:snowmobile +804:snowplow, snowplough +805:soap dispenser +806:soccer ball +807:sock +808:solar dish, solar collector, solar furnace +809:sombrero +810:soup bowl +811:space bar +812:space heater +813:space shuttle +814:spatula +815:speedboat +816:spider web, spider's web +817:spindle +818:sports car, sport car +819:spotlight, spot +820:stage +821:steam locomotive +822:steel arch bridge +823:steel drum +824:stethoscope +825:stole +826:stone wall +827:stopwatch, stop watch +828:stove +829:strainer +830:streetcar, tram, tramcar, trolley, trolley car +831:stretcher +832:studio couch, day bed +833:stupa, tope +834:submarine, pigboat, sub, U-boat +835:suit, suit of clothes +836:sundial +837:sunglass +838:sunglasses, dark glasses, shades +839:sunscreen, sunblock, sun blocker +840:suspension bridge +841:swab, swob, mop +842:sweatshirt +843:swimming trunks, bathing trunks +844:swing +845:switch, electric switch, electrical switch +846:syringe +847:table lamp +848:tank, army tank, armored combat vehicle, armoured combat vehicle +849:tape player +850:teapot +851:teddy, teddy bear +852:television, television system +853:tennis ball +854:thatch, thatched roof +855:theater curtain, theatre curtain +856:thimble +857:thresher, thrasher, threshing machine +858:throne +859:tile roof +860:toaster +861:tobacco shop, tobacconist shop, tobacconist +862:toilet seat +863:torch +864:totem pole +865:tow truck, tow car, wrecker +866:toyshop +867:tractor +868:trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +869:tray +870:trench coat +871:tricycle, trike, velocipede +872:trimaran +873:tripod +874:triumphal arch +875:trolleybus, trolley coach, trackless trolley +876:trombone +877:tub, vat +878:turnstile +879:typewriter keyboard +880:umbrella +881:unicycle, monocycle +882:upright, upright piano +883:vacuum, vacuum cleaner +884:vase +885:vault +886:velvet +887:vending machine 
+888:vestment +889:viaduct +890:violin, fiddle +891:volleyball +892:waffle iron +893:wall clock +894:wallet, billfold, notecase, pocketbook +895:wardrobe, closet, press +896:warplane, military plane +897:washbasin, handbasin, washbowl, lavabo, wash-hand basin +898:washer, automatic washer, washing machine +899:water bottle +900:water jug +901:water tower +902:whiskey jug +903:whistle +904:wig +905:window screen +906:window shade +907:Windsor tie +908:wine bottle +909:wing +910:wok +911:wooden spoon +912:wool, woolen, woollen +913:worm fence, snake fence, snake-rail fence, Virginia fence +914:wreck +915:yawl +916:yurt +917:web site, website, internet site, site +918:comic book +919:crossword puzzle, crossword +920:street sign +921:traffic light, traffic signal, stoplight +922:book jacket, dust cover, dust jacket, dust wrapper +923:menu +924:plate +925:guacamole +926:consomme +927:hot pot, hotpot +928:trifle +929:ice cream, icecream +930:ice lolly, lolly, lollipop, popsicle +931:French loaf +932:bagel, beigel +933:pretzel +934:cheeseburger +935:hotdog, hot dog, red hot +936:mashed potato +937:head cabbage +938:broccoli +939:cauliflower +940:zucchini, courgette +941:spaghetti squash +942:acorn squash +943:butternut squash +944:cucumber, cuke +945:artichoke, globe artichoke +946:bell pepper +947:cardoon +948:mushroom +949:Granny Smith +950:strawberry +951:orange +952:lemon +953:fig +954:pineapple, ananas +955:banana +956:jackfruit, jak, jack +957:custard apple +958:pomegranate +959:hay +960:carbonara +961:chocolate sauce, chocolate syrup +962:dough +963:meat loaf, meatloaf +964:pizza, pizza pie +965:potpie +966:burrito +967:red wine +968:espresso +969:cup +970:eggnog +971:alp +972:bubble +973:cliff, drop, drop-off +974:coral reef +975:geyser +976:lakeside, lakeshore +977:promontory, headland, head, foreland +978:sandbar, sand bar +979:seashore, coast, seacoast, sea-coast +980:valley, vale +981:volcano +982:ballplayer, baseball player +983:groom, bridegroom +984:scuba diver +985:rapeseed +986:daisy +987:yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +988:corn +989:acorn +990:hip, rose hip, rosehip +991:buckeye, horse chestnut, conker +992:coral fungus +993:agaric +994:gyromitra +995:stinkhorn, carrion fungus +996:earthstar +997:hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +998:bolete +999:ear, spike, capitulum +1000:toilet tissue, toilet paper, bathroom tissue \ No newline at end of file diff --git a/examples/imx500/imx500_classification_demo.py b/examples/imx500/imx500_classification_demo.py new file mode 100755 index 00000000..1742471a --- /dev/null +++ b/examples/imx500/imx500_classification_demo.py @@ -0,0 +1,156 @@ +import argparse +import sys +import time +from typing import List + +import cv2 +import numpy as np + +from picamera2 import CompletedRequest, MappedArray, Picamera2 +from picamera2.devices import IMX500 +from picamera2.devices.imx500 import NetworkIntrinsics +from picamera2.devices.imx500.postprocess import softmax + +last_detections = [] +LABELS = None + + +class Classification: + def __init__(self, idx: int, score: float): + """Create a Classification object, recording the idx and score.""" + self.idx = idx + self.score = score + + +def get_label(request: CompletedRequest, idx: int) -> str: + """Retrieve the label corresponding to the classification index.""" + global LABELS + if LABELS is None: + LABELS = intrinsics.labels + assert len(LABELS) in [1000, 1001], "Labels file should contain 1000 or 1001 
labels." + output_tensor_size = imx500.get_output_shapes(request.get_metadata())[0][0] + if output_tensor_size == 1000: + LABELS = LABELS[1:] # Ignore the background label if present + return LABELS[idx] + + +def parse_and_draw_classification_results(request: CompletedRequest): + """Analyse and draw the classification results in the output tensor.""" + results = parse_classification_results(request) + draw_classification_results(request, results) + + +def parse_classification_results(request: CompletedRequest) -> List[Classification]: + """Parse the output tensor into the classification results above the threshold.""" + global last_detections + np_outputs = imx500.get_outputs(request.get_metadata()) + if np_outputs is None: + return last_detections + np_output = np_outputs[0] + if intrinsics.softmax: + np_output = softmax(np_output) + top_indices = np.argpartition(-np_output, 3)[:3] # Get top 3 indices with the highest scores + top_indices = top_indices[np.argsort(-np_output[top_indices])] # Sort the top 3 indices by their scores + last_detections = [Classification(index, np_output[index]) for index in top_indices] + return last_detections + + +def draw_classification_results(request: CompletedRequest, results: List[Classification], stream: str = "main"): + """Draw the classification results for this request onto the ISP output.""" + with MappedArray(request, stream) as m: + if intrinsics.preserve_aspect_ratio: + # Drawing ROI box + b_x, b_y, b_w, b_h = imx500.get_roi_scaled(request) + color = (255, 0, 0) # red + cv2.putText(m.array, "ROI", (b_x + 5, b_y + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1) + cv2.rectangle(m.array, (b_x, b_y), (b_x + b_w, b_y + b_h), (255, 0, 0, 0)) + text_left, text_top = b_x, b_y + 20 + else: + text_left, text_top = 0, 0 + # Drawing labels (in the ROI box if it exists) + for index, result in enumerate(results): + label = get_label(request, idx=result.idx) + text = f"{label}: {result.score:.3f}" + + # Calculate text size and position + (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + text_x = text_left + 5 + text_y = text_top + 15 + index * 20 + + # Create a copy of the array to draw the background with opacity + overlay = m.array.copy() + + # Draw the background rectangle on the overlay + cv2.rectangle(overlay, + (text_x, text_y - text_height), + (text_x + text_width, text_y + baseline), + (255, 255, 255), # Background color (white) + cv2.FILLED) + + alpha = 0.3 + cv2.addWeighted(overlay, alpha, m.array, 1 - alpha, 0, m.array) + + # Draw text on top of the background + cv2.putText(m.array, text, (text_x, text_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) + + +def get_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_mobilenet_v2.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("-s", "--softmax", action=argparse.BooleanOptionalAction, help="Add post-process softmax") + parser.add_argument("-r", "--preserve-aspect-ratio", action=argparse.BooleanOptionalAction, + help="preprocess the image with preserve aspect ratio") + parser.add_argument("--labels", type=str, + help="Path to the labels file") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + # This must be called 
before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "classification" + elif intrinsics.task != "classification": + print("Network is not a classification task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if key == 'labels' and value is not None: + with open(value, 'r') as f: + intrinsics.labels = f.read().splitlines() + elif hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + if intrinsics.labels is None: + with open("assets/imagenet_labels.txt", "r") as f: + intrinsics.labels = f.read().splitlines() + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + picam2 = Picamera2(imx500.camera_num) + config = picam2.create_preview_configuration(controls={"FrameRate": intrinsics.inference_rate}, buffer_count=12) + + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=True) + if intrinsics.preserve_aspect_ratio: + imx500.set_auto_aspect_ratio() + # Register the callback to parse and draw classification results + picam2.pre_callback = parse_and_draw_classification_results + + while True: + time.sleep(0.5) diff --git a/examples/imx500/imx500_object_detection_demo.py b/examples/imx500/imx500_object_detection_demo.py new file mode 100755 index 00000000..8e868cb5 --- /dev/null +++ b/examples/imx500/imx500_object_detection_demo.py @@ -0,0 +1,179 @@ +import argparse +import sys +from functools import lru_cache + +import cv2 +import numpy as np + +from picamera2 import MappedArray, Picamera2 +from picamera2.devices import IMX500 +from picamera2.devices.imx500 import (NetworkIntrinsics, + postprocess_nanodet_detection) + +last_detections = [] + + +class Detection: + def __init__(self, coords, category, conf, metadata): + """Create a Detection object, recording the bounding box, category and confidence.""" + self.category = category + self.conf = conf + self.box = imx500.convert_inference_coords(coords, metadata, picam2) + + +def parse_detections(metadata: dict): + """Parse the output tensor into a number of detected objects, scaled to the ISP output.""" + global last_detections + bbox_normalization = intrinsics.bbox_normalization + bbox_order = intrinsics.bbox_order + threshold = args.threshold + iou = args.iou + max_detections = args.max_detections + + np_outputs = imx500.get_outputs(metadata, add_batch=True) + input_w, input_h = imx500.get_input_size() + if np_outputs is None: + return last_detections + if intrinsics.postprocess == "nanodet": + boxes, scores, classes = \ + postprocess_nanodet_detection(outputs=np_outputs[0], conf=threshold, iou_thres=iou, + max_out_dets=max_detections)[0] + from picamera2.devices.imx500.postprocess import scale_boxes + boxes = scale_boxes(boxes, 1, 1, input_h, input_w, False, False) + else: + boxes, scores, classes = np_outputs[0][0], np_outputs[1][0], np_outputs[2][0] + if bbox_normalization: + boxes = boxes / input_h + + if bbox_order == "xy": + boxes = boxes[:, [1, 0, 3, 2]] + boxes = np.array_split(boxes, 4, axis=1) + boxes = zip(*boxes) + + last_detections = [ + Detection(box, category, score, metadata) + for box, score, category in zip(boxes, scores, classes) + if score > threshold + ] + return last_detections + + +@lru_cache +def get_labels(): + labels = intrinsics.labels + + if intrinsics.ignore_dash_labels: + labels = [label for label in labels if label and label != 
"-"] + return labels + + +def draw_detections(request, stream="main"): + """Draw the detections for this request onto the ISP output.""" + detections = last_results + if detections is None: + return + labels = get_labels() + with MappedArray(request, stream) as m: + for detection in detections: + x, y, w, h = detection.box + label = f"{labels[int(detection.category)]} ({detection.conf:.2f})" + + # Calculate text size and position + (text_width, text_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + text_x = x + 5 + text_y = y + 15 + + # Create a copy of the array to draw the background with opacity + overlay = m.array.copy() + + # Draw the background rectangle on the overlay + cv2.rectangle(overlay, + (text_x, text_y - text_height), + (text_x + text_width, text_y + baseline), + (255, 255, 255), # Background color (white) + cv2.FILLED) + + alpha = 0.30 + cv2.addWeighted(overlay, alpha, m.array, 1 - alpha, 0, m.array) + + # Draw text on top of the background + cv2.putText(m.array, label, (text_x, text_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) + + # Draw detection box + cv2.rectangle(m.array, (x, y), (x + w, y + h), (0, 255, 0, 0), thickness=2) + + if intrinsics.preserve_aspect_ratio: + b_x, b_y, b_w, b_h = imx500.get_roi_scaled(request) + color = (255, 0, 0) # red + cv2.putText(m.array, "ROI", (b_x + 5, b_y + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1) + cv2.rectangle(m.array, (b_x, b_y), (b_x + b_w, b_y + b_h), (255, 0, 0, 0)) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("--bbox-normalization", action=argparse.BooleanOptionalAction, help="Normalize bbox") + parser.add_argument("--bbox-order", choices=["yx", "xy"], default="yx", + help="Set bbox order yx -> (y0, x0, y1, x1) xy -> (x0, y0, x1, y1)") + parser.add_argument("--threshold", type=float, default=0.55, help="Detection threshold") + parser.add_argument("--iou", type=float, default=0.65, help="Set iou threshold") + parser.add_argument("--max-detections", type=int, default=10, help="Set max detections") + parser.add_argument("--ignore-dash-labels", action=argparse.BooleanOptionalAction, help="Remove '-' labels ") + parser.add_argument("--postprocess", choices=["", "nanodet"], + default=None, help="Run post process of type") + parser.add_argument("-r", "--preserve-aspect-ratio", action=argparse.BooleanOptionalAction, + help="preserve the pixel aspect ratio of the input tensor") + parser.add_argument("--labels", type=str, + help="Path to the labels file") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + # This must be called before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "object detection" + elif intrinsics.task != "object detection": + print("Network is not an object detection task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if key == 'labels' and value is not None: + with open(value, 'r') as f: + intrinsics.labels = f.read().splitlines() + elif hasattr(intrinsics, key) and value is not None: + 
setattr(intrinsics, key, value) + + # Defaults + if intrinsics.labels is None: + with open("assets/coco_labels.txt", "r") as f: + intrinsics.labels = f.read().splitlines() + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + picam2 = Picamera2(imx500.camera_num) + config = picam2.create_preview_configuration(controls={"FrameRate": intrinsics.inference_rate}, buffer_count=12) + + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=True) + + if intrinsics.preserve_aspect_ratio: + imx500.set_auto_aspect_ratio() + + last_results = None + picam2.pre_callback = draw_detections + while True: + last_results = parse_detections(picam2.capture_metadata()) diff --git a/examples/imx500/imx500_object_detection_demo_mp.py b/examples/imx500/imx500_object_detection_demo_mp.py new file mode 100755 index 00000000..9bc5bd49 --- /dev/null +++ b/examples/imx500/imx500_object_detection_demo_mp.py @@ -0,0 +1,194 @@ +import argparse +import multiprocessing +import queue +import sys +import threading +from functools import lru_cache + +import cv2 +import numpy as np + +from picamera2 import MappedArray, Picamera2 +from picamera2.devices import IMX500 +from picamera2.devices.imx500 import (NetworkIntrinsics, + postprocess_nanodet_detection) + + +class Detection: + def __init__(self, coords, category, conf, metadata): + """Create a Detection object, recording the bounding box, category and confidence.""" + self.category = category + self.conf = conf + self.box = imx500.convert_inference_coords(coords, metadata, picam2) + + +def parse_detections(metadata: dict): + """Parse the output tensor into a number of detected objects, scaled to the ISP output.""" + bbox_normalization = intrinsics.bbox_normalization + threshold = args.threshold + iou = args.iou + max_detections = args.max_detections + + np_outputs = imx500.get_outputs(metadata, add_batch=True) + input_w, input_h = imx500.get_input_size() + if np_outputs is None: + return None + if intrinsics.postprocess == "nanodet": + boxes, scores, classes = \ + postprocess_nanodet_detection(outputs=np_outputs[0], conf=threshold, iou_thres=iou, + max_out_dets=max_detections)[0] + from picamera2.devices.imx500.postprocess import scale_boxes + boxes = scale_boxes(boxes, 1, 1, input_h, input_w, False, False) + else: + boxes, scores, classes = np_outputs[0][0], np_outputs[1][0], np_outputs[2][0] + if bbox_normalization: + boxes = boxes / input_h + + boxes = np.array_split(boxes, 4, axis=1) + boxes = zip(*boxes) + + detections = [ + Detection(box, category, score, metadata) + for box, score, category in zip(boxes, scores, classes) + if score > threshold + ] + return detections + + +@lru_cache +def get_labels(): + labels = intrinsics.labels + + if intrinsics.ignore_dash_labels: + labels = [label for label in labels if label and label != "-"] + return labels + + +def draw_detections(jobs): + """Draw the detections for this request onto the ISP output.""" + labels = get_labels() + # Wait for result from child processes in the order submitted. 
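# Aside: a minimal, self-contained sketch of the same idea (not part of this demo,
# and pow() here is purely illustrative): futures queued in submission order and
# popped FIFO get handled in frame order, even if the worker processes finish out of order.
import multiprocessing
import queue

with multiprocessing.Pool(4) as demo_pool:
    pending = queue.Queue()
    for i in range(8):
        pending.put(demo_pool.apply_async(pow, (i, 2)))
    while not pending.empty():
        print(pending.get().get())  # prints 0, 1, 4, 9, ... in submission order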
+ last_detections = [] + while (job := jobs.get()) is not None: + request, async_result = job + detections = async_result.get() + if detections is None: + detections = last_detections + last_detections = detections + with MappedArray(request, 'main') as m: + for detection in detections: + x, y, w, h = detection.box + label = f"{labels[int(detection.category)]} ({detection.conf:.2f})" + + # Calculate text size and position + (text_width, text_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + text_x = x + 5 + text_y = y + 15 + + # Create a copy of the array to draw the background with opacity + overlay = m.array.copy() + + # Draw the background rectangle on the overlay + cv2.rectangle(overlay, + (text_x, text_y - text_height), + (text_x + text_width, text_y + baseline), + (255, 255, 255), # Background color (white) + cv2.FILLED) + + alpha = 0.3 + cv2.addWeighted(overlay, alpha, m.array, 1 - alpha, 0, m.array) + + # Draw text on top of the background + cv2.putText(m.array, label, (text_x, text_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) + + # Draw detection box + cv2.rectangle(m.array, (x, y), (x + w, y + h), (0, 255, 0), thickness=2) + + if intrinsics.preserve_aspect_ratio: + b_x, b_y, b_w, b_h = imx500.get_roi_scaled(request) + color = (255, 0, 0) # red + cv2.putText(m.array, "ROI", (b_x + 5, b_y + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1) + cv2.rectangle(m.array, (b_x, b_y), (b_x + b_w, b_y + b_h), (255, 0, 0, 0)) + + cv2.imshow('IMX500 Object Detection', m.array) + cv2.waitKey(1) + request.release() + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("--bbox-normalization", action=argparse.BooleanOptionalAction, help="Normalize bbox") + parser.add_argument("--threshold", type=float, default=0.55, help="Detection threshold") + parser.add_argument("--iou", type=float, default=0.65, help="Set iou threshold") + parser.add_argument("--max-detections", type=int, default=10, help="Set max detections") + parser.add_argument("--ignore-dash-labels", action=argparse.BooleanOptionalAction, help="Remove '-' labels ") + parser.add_argument("--postprocess", choices=["", "nanodet"], + default=None, help="Run post process of type") + parser.add_argument("-r", "--preserve-aspect-ratio", action=argparse.BooleanOptionalAction, + help="preserve the pixel aspect ratio of the input tensor") + parser.add_argument("--labels", type=str, + help="Path to the labels file") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + # This must be called before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "object detection" + elif intrinsics.task != "object detection": + print("Network is not an object detection task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if key == 'labels' and value is not None: + with open(value, 'r') as f: + intrinsics.labels = f.read().splitlines() + elif hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + if intrinsics.labels is 
None: + with open("assets/coco_labels.txt", "r") as f: + intrinsics.labels = f.read().splitlines() + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + picam2 = Picamera2(imx500.camera_num) + main = {'format': 'RGB888'} + config = picam2.create_preview_configuration(main, controls={"FrameRate": intrinsics.inference_rate}, buffer_count=12) + + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=False) + if intrinsics.preserve_aspect_ratio: + imx500.set_auto_aspect_ratio() + + pool = multiprocessing.Pool(processes=4) + jobs = queue.Queue() + + thread = threading.Thread(target=draw_detections, args=(jobs,)) + thread.start() + + while True: + # The request gets released by handle_results + request = picam2.capture_request() + metadata = request.get_metadata() + if metadata: + async_result = pool.apply_async(parse_detections, (metadata,)) + jobs.put((request, async_result)) + else: + request.release() diff --git a/examples/imx500/imx500_pose_estimation_higherhrnet_demo.py b/examples/imx500/imx500_pose_estimation_higherhrnet_demo.py new file mode 100755 index 00000000..2b754e46 --- /dev/null +++ b/examples/imx500/imx500_pose_estimation_higherhrnet_demo.py @@ -0,0 +1,117 @@ +import argparse +import sys +import time + +import numpy as np + +from picamera2 import CompletedRequest, MappedArray, Picamera2 +from picamera2.devices.imx500 import IMX500, NetworkIntrinsics +from picamera2.devices.imx500.postprocess import COCODrawer +from picamera2.devices.imx500.postprocess_highernet import \ + postprocess_higherhrnet + +last_boxes = None +last_scores = None +last_keypoints = None +WINDOW_SIZE_H_W = (480, 640) + + +def ai_output_tensor_parse(metadata: dict): + """Parse the output tensor into a number of detected objects, scaled to the ISP output.""" + global last_boxes, last_scores, last_keypoints + np_outputs = imx500.get_outputs(metadata=metadata, add_batch=True) + if np_outputs is not None: + keypoints, scores, boxes = postprocess_higherhrnet(outputs=np_outputs, + img_size=WINDOW_SIZE_H_W, + img_w_pad=(0, 0), + img_h_pad=(0, 0), + detection_threshold=args.detection_threshold, + network_postprocess=True) + + if scores is not None and len(scores) > 0: + last_keypoints = np.reshape(np.stack(keypoints, axis=0), (len(scores), 17, 3)) + last_boxes = [np.array(b) for b in boxes] + last_scores = np.array(scores) + return last_boxes, last_scores, last_keypoints + + +def ai_output_tensor_draw(request: CompletedRequest, boxes, scores, keypoints, stream='main'): + """Draw the detections for this request onto the ISP output.""" + with MappedArray(request, stream) as m: + if boxes is not None and len(boxes) > 0: + drawer.annotate_image(m.array, boxes, scores, + np.zeros(scores.shape), keypoints, args.detection_threshold, + args.detection_threshold, request.get_metadata(), picam2, stream) + + +def picamera2_pre_callback(request: CompletedRequest): + """Analyse the detected objects in the output tensor and draw them on the main output image.""" + boxes, scores, keypoints = ai_output_tensor_parse(request.get_metadata()) + ai_output_tensor_draw(request, boxes, scores, keypoints) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_higherhrnet_coco.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("--detection-threshold", type=float, default=0.3, + help="Post-process detection 
threshold") + parser.add_argument("--labels", type=str, + help="Path to the labels file") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +def get_drawer(): + categories = intrinsics.labels + categories = [c for c in categories if c and c != "-"] + return COCODrawer(categories, imx500, needs_rescale_coords=False) + + +if __name__ == "__main__": + args = get_args() + + # This must be called before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "pose estimation" + elif intrinsics.task != "pose estimation": + print("Network is not a pose estimation task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if key == 'labels' and value is not None: + with open(value, 'r') as f: + intrinsics.labels = f.read().splitlines() + elif hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + if intrinsics.inference_rate is None: + intrinsics.inference_rate = 10 + if intrinsics.labels is None: + with open("assets/coco_labels.txt", "r") as f: + intrinsics.labels = f.read().splitlines() + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + drawer = get_drawer() + + picam2 = Picamera2(imx500.camera_num) + config = picam2.create_preview_configuration(controls={'FrameRate': intrinsics.inference_rate}, buffer_count=12) + + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=True) + imx500.set_auto_aspect_ratio() + picam2.pre_callback = picamera2_pre_callback + + while True: + time.sleep(0.5) diff --git a/examples/imx500/imx500_segmentation_demo.py b/examples/imx500/imx500_segmentation_demo.py new file mode 100755 index 00000000..64263508 --- /dev/null +++ b/examples/imx500/imx500_segmentation_demo.py @@ -0,0 +1,101 @@ +import argparse +import sys +import time +from typing import Dict + +import numpy as np + +from picamera2 import CompletedRequest, Picamera2 +from picamera2.devices import IMX500 +from picamera2.devices.imx500 import NetworkIntrinsics + +COLOURS = np.loadtxt("assets/colours.txt") + + +def create_and_draw_masks(request: CompletedRequest): + """Create masks from the output tensor and draw them on the main output image.""" + masks = create_masks(request) + draw_masks(masks) + + +def create_masks(request: CompletedRequest) -> Dict[int, np.ndarray]: + """Create masks from the output tensor, scaled to the ISP output.""" + res = {} + np_outputs = imx500.get_outputs(metadata=request.get_metadata()) + input_w, input_h = imx500.get_input_size() + if np_outputs is None: + return res + mask = np_outputs[0] + found_indices = np.unique(mask) + + for i in found_indices: + if i == 0: + continue + output_shape = [input_h, input_w, 4] + colour = [(0, 0, 0, 0), COLOURS[int(i)]] + colour[1][3] = 150 # update the alpha value here, to save setting it later + overlay = np.array(mask == i, dtype=np.uint8) + overlay = np.array(colour)[overlay].reshape(output_shape).astype(np.uint8) + # No need to resize the overlay, it will be stretched to the output window. 
+ res[i] = overlay + return res + + +def draw_masks(masks: Dict[int, np.ndarray]): + """Draw the masks for this request onto the ISP output.""" + if not masks: + return + input_w, input_h = imx500.get_input_size() + output_shape = [input_h, input_w, 4] + overlay = np.zeros(output_shape, dtype=np.uint8) + if masks: + for v in masks.values(): + overlay += v + # Set Alphas and overlay + picam2.set_overlay(overlay) + + +def get_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_deeplabv3plus.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + # This must be called before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "segmentation" + elif intrinsics.task != "segmentation": + print("Network is not a segmentation task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + picam2 = Picamera2(imx500.camera_num) + config = picam2.create_preview_configuration(controls={'FrameRate': intrinsics.inference_rate}, buffer_count=12) + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=True) + picam2.pre_callback = create_and_draw_masks + + while True: + time.sleep(0.5) diff --git a/examples/picamera2_multiprocessing.py b/examples/picamera2_multiprocessing.py new file mode 100755 index 00000000..bcdcff54 --- /dev/null +++ b/examples/picamera2_multiprocessing.py @@ -0,0 +1,243 @@ +#!/usr/bin/python3 + +# A demonstration of how to pass image buffers to other Python processes, using the +# dmabuf file descriptors so as to avoid copying all the pixel data. + +import mmap +import multiprocessing as mp +import os +import queue +from collections import deque +from concurrent.futures import Future +from ctypes import CDLL, c_int, c_long, get_errno +from threading import Thread + +import numpy as np + + +class Process(mp.Process): + """A separate process for multi-processing that receives shared camera frames from Picamera2.""" + + def __init__(self, picam2, name='main', *args, **kwargs): + """Create a Picamera2 child process. Call after Picamera2 has been configured. + + Arguments: + picam2 - the Picamera2 object + name - the name of the stream whose images are to be passed to the child process + """ + super().__init__(*args, **kwargs) + self.config = picam2.camera_configuration()[name] + self._picam2_pid = os.getpid() + self._pid_fd = None + self._send_queue = mp.Queue() + self._return_queue = mp.Queue() + self._arrays = {} + self._return_result = False + self._syscall = CDLL(None, use_errno=True).syscall + self._syscall.argtypes = [c_long] + self.start() + self._stream = picam2.stream_map[name] + self._requests_sent = deque() + self._thread = Thread(target=self._return_thread, args=()) + self._thread.start() + + def _return_thread(self): + # Runs in a thread in the Picamera2 process to return requests to libcamera. 
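# Aside: the request/future pairing used by this class, in miniature (a standalone
# sketch, not library code). send() appends (request, future) to a deque, and since
# replies come back in the order the requests were sent, popleft() always matches the
# oldest outstanding request; set_result() then wakes whoever is waiting on the future.
from collections import deque
from concurrent.futures import Future

pending = deque()
fut = Future()
pending.append(("request-1", fut))      # what send() does
_, oldest_future = pending.popleft()    # what this return thread does
oldest_future.set_result("done")        # unblocks a caller waiting on fut.result()
assert fut.result() == "done"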
+ while True: + result = self._return_queue.get() # requests are finished with in the order we sent them + if not bool(self._requests_sent): + break # we get a reply but with no request sent when we're closing down + request, future = self._requests_sent.popleft() + future.set_result(result) + request.release() + + def send(self, request, *args): + """Call from the Picamera2 process to send an image from this request to the child process. + + Arguments: + request - the request from which the image is passed to the child process + args - optional extra parameters that are passed across with the image + + Returns a future which the caller can optionally wait on to get the child process's result. + """ + plane = request.request.buffers[self._stream].planes[0] + fd = plane.fd + length = plane.length + future = Future() + request.acquire() + self._requests_sent.append((request, future)) + self._send_queue.put((fd, length, args)) + return future + + def _format_array(self, mem): + # Format the memory buffer into a numpy image array. + array = np.array(mem, copy=False, dtype=np.uint8) + width, height = self.config['size'] + stride = self.config['stride'] + format = self.config['format'] + if format == 'YUV420': + return array.reshape((height + height // 2, stride)) + array = array.reshape((height, stride)) + if format in ('RGB888', 'BGR888'): + return array[:, :width * 3].reshape((height, width, 3)) + elif format in ("XBGR8888", "XRGB8888"): + return array[:, :width * 4].reshape((height, width, 4)) + return array + + def _map_fd(self, picam2_fd): + # Map the Picamera2 process's fd to our own. Strictly speaking you don't need this if + # Picamera2 has already allocated the buffers before it gets forked. But it can be hard + # to know and there should be no great harm in doing this anyway. + if self._pid_fd is None: + self._pid_fd = os.pidfd_open(self._picam2_pid) + fd = self._syscall(438, c_int(self._pid_fd), c_int(picam2_fd), c_int(0)) # 438 is pidfd_getfd + if fd == -1: + errno = get_errno() + raise OSError(errno, os.strerror(errno)) + return fd + + def capture_shared_array(self): + """Call from the child process to wait for a shared image array from the Picamera2 process. + + Once the image is received, self.args will contain any parameters that were sent with it. + Returns the numpy image array, or None if we are being shut down and must quit. + """ + # Tell the Picamera2 process (if we haven't already) that we're done with the previous image. + if self._return_result: + self._return_queue.put(None) + self._return_result = True + # Wait for the next image. A "CLOSE" message means they're shutting us down. + msg = self._send_queue.get() + if msg == "CLOSE": + self._return_queue.put(None) + return None + # We have a new buffer. The message contains Picamera2's fd, the buffer length and arguments. + picam2_fd, length, self.args = msg + if picam2_fd in self._arrays: # have we seen this buffer before? + return self._arrays[picam2_fd] + # Otherwise create a local fd, and mmap it to create a numpy image array. + fd = self._map_fd(picam2_fd) + mem = mmap.mmap(fd, length, mmap.MAP_SHARED, mmap.PROT_READ) + array = self._format_array(mem) + self._arrays[picam2_fd] = array + return array + + def set_result(self, result): + """Call from the child process to return a result to the Picamera2 process. + + In turn, this will cause the Picamera2 process to release the request back to libcamera. + Calling this is optional; if you don't, the next call to capture_shared_array will dispose + of the image anyway. 
+ """ + self._return_result = False + self._return_queue.put(result) + + def run(self): + """Derived classes should override this to define what the child process does.""" + pass + + def close(self): + """Call from the Picamera2 process to close the child process.""" + self._send_queue.put("CLOSE") + self._thread.join() + self.join() + super().close() + + +# The multi-processing module has a Pool class, though I can't see how to make it run my +# own derived Process instances. Maybe I've missed something. Anyhow, here follows a +# simple-minded implementation thereof. + +class Pool: + """A pool of Picamera2 child processes to which tasks can be sent.""" + + def __init__(self, num_processes, process, picam2, name='main', maxsize=0, *args, **kwargs): + """Create a Picamera2 child process pool.""" + self._processes = [process(picam2, name, *args, **kwargs) for _ in range(num_processes)] + self._futures = queue.Queue(maxsize=maxsize) + self._count = 0 + for p in self._processes: + p._count = 0 + self._thread = Thread(target=self._handle_thread, args=()) + self._thread.start() + + def send(self, request, *args): + """Call from the Picamera2 process to send an image to one of the pool's child processes. + + Arguments: as per Process.send. + Returns nothing. The child process's return value will be passed to handle_result. + """ + # Choose the process with least pending work to do, and the LRU among those. + process = min(self._processes, key=lambda p: (len(p._requests_sent), p._count)) + self._count += 1 + process._count = self._count + self._futures.put(process.send(request, *args)) + + def _handle_thread(self): + # Thread in the Picamera2 process to wait for and handle child process results. + while True: + future = self._futures.get() + if future is None: # happens when we're being closed + break + self.handle_result(future.result()) + + def handle_result(self, result): + """Derived classes should override this to define what to do with the child process results.""" + pass + + def close(self): + """Call from the Picamera2 process to close the pool and all the child processes.""" + for p in self._processes: + p.close() + self._futures.put(None) + self._thread.join() + + +# Below here is all demo/test code. + +if __name__ == "__main__": + # Simple test showing how to use the Process class. + from picamera2 import Picamera2 + + class MyProcess(Process): + def run(self): + while (array := self.capture_shared_array()) is not None: + print(array.shape, self.args) + self.set_result(self.args[0]) # send back the parameter we were given! + + picam2 = Picamera2() + config = picam2.create_preview_configuration({'format': 'RGB888'}) + picam2.start(config) + + process = MyProcess(picam2, 'main') # send images from the "main" stream to the child process + + for _ in range(50): + with picam2.captured_request() as request: + exposure_time = request.get_metadata()['ExposureTime'] + future = process.send(request, exposure_time) + if exposure_time != future.result(): + print("ERROR: exposure time has come back different!") + + process.close() + + # Here's a similar thing using a Pool, which starts 4 other processes. 
+ import time + + class MyProcess2(Process): + def run(self): + while self.capture_shared_array() is not None: + print("Received:", self.args[0]) + time.sleep(0.05) + self.set_result(self.args[0]) # after a delay, return the parameter we were given + + class MyPool(Pool): + def handle_result(self, result): + print("Finished:", result) + + pool = MyPool(num_processes=4, process=MyProcess2, picam2=picam2, name='main', maxsize=10) + + for i in range(50): + with picam2.captured_request() as request: + pool.send(request, i) + + pool.close() diff --git a/examples/pyav_capture.py b/examples/pyav_capture.py new file mode 100755 index 00000000..9986e472 --- /dev/null +++ b/examples/pyav_capture.py @@ -0,0 +1,21 @@ +#!/usr/bin/python3 + +# Example using PyavOutput to record to an mp4 file. + +import time + +from picamera2 import Picamera2 +from picamera2.encoders import H264Encoder +from picamera2.outputs import PyavOutput + +picam2 = Picamera2() +config = picam2.create_video_configuration({'size': (1280, 720), 'format': 'YUV420'}) +picam2.configure(config) + +encoder = H264Encoder(bitrate=10000000) +output = PyavOutput("test.mp4") +picam2.start_recording(encoder, output) + +time.sleep(5) + +picam2.stop_recording() diff --git a/examples/pyav_circular_capture.py b/examples/pyav_circular_capture.py new file mode 100755 index 00000000..7f327bf6 --- /dev/null +++ b/examples/pyav_circular_capture.py @@ -0,0 +1,29 @@ +#!/usr/bin/python3 + +# Example using PyavOutput through a circular buffer to capture files. + +import time + +from picamera2 import Picamera2 +from picamera2.encoders import H264Encoder +from picamera2.outputs import CircularOutput2, PyavOutput + +picam2 = Picamera2() +config = picam2.create_video_configuration({'size': (1280, 720), 'format': 'YUV420'}) +picam2.configure(config) + +encoder = H264Encoder(bitrate=10000000) +circular = CircularOutput2(buffer_duration_ms=5000) +picam2.start_recording(encoder, circular) + +time.sleep(5) + +# This will capture the video from "buffer_duration_ms" (5 seconds) ago. +circular.open_output(PyavOutput("test1.mp4")) +time.sleep(5) +circular.close_output() + +# Because this is not closed when the circular buffer stops, the remaining 5 seconds +# will get flushed into here. +circular.open_output(PyavOutput("test2.mp4")) +picam2.stop_recording() diff --git a/examples/pyav_stream.py b/examples/pyav_stream.py new file mode 100755 index 00000000..74282803 --- /dev/null +++ b/examples/pyav_stream.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 + +# Example using PyavOutput to serve an MPEG2 transport stream to TCP connections. 
+# Just point a stream player at tcp://:8888 + +from threading import Event + +from picamera2 import Picamera2 +from picamera2.encoders import H264Encoder +from picamera2.outputs import PyavOutput + +picam2 = Picamera2() +config = picam2.create_video_configuration({'size': (1280, 720), 'format': 'YUV420'}) +picam2.configure(config) + +event = Event() + + +def callback(e): + event.set() + + +while True: + encoder = H264Encoder(bitrate=10000000) + output = PyavOutput("tcp://0.0.0.0:8888\?listen=1", format="mpegts") # noqa + output.error_callback = callback + picam2.start_recording(encoder, output) + + event.wait() + event.clear() + print("Client disconnected") + + picam2.stop_recording() diff --git a/examples/pyav_stream2.py b/examples/pyav_stream2.py new file mode 100755 index 00000000..d0c45743 --- /dev/null +++ b/examples/pyav_stream2.py @@ -0,0 +1,37 @@ +#!/usr/bin/python3 + +import socket +from threading import Event + +from picamera2 import Picamera2 +from picamera2.encoders import H264Encoder +from picamera2.outputs import PyavOutput + +picam2 = Picamera2() +video_config = picam2.create_video_configuration({"size": (1280, 720), 'format': 'YUV420'}) +picam2.configure(video_config) + +encoder = H264Encoder(bitrate=10000000) +encoder.audio = True + +with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(("0.0.0.0", 8888)) + + while True: + print("Waiting") + sock.listen() + + conn, addr = sock.accept() + print("Connected") + + output = PyavOutput(f"pipe:{conn.fileno()}", format="mpegts") + event = Event() + output.error_callback = lambda e: event.set() # noqa + + picam2.start_recording(encoder, output) + + event.wait() + print("Disconnected") + + picam2.stop_recording() diff --git a/examples/request_context_manager.py b/examples/request_context_manager.py new file mode 100755 index 00000000..ffb30540 --- /dev/null +++ b/examples/request_context_manager.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +# Demonstrate use of a context manager with "captured_request()". This is convenient because +# requests are released automatically for you. 
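# Roughly speaking, each "with picam2.captured_request() as request:" block below
# behaves like the following sketch (an illustration of the idea, not the library's
# exact internals): capture a request, use it, and always hand it back to libcamera.
#
#     request = picam2.capture_request()
#     try:
#         print(request)        # use the request
#     finally:
#         request.release()     # return the buffers to libcamera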
+ +from picamera2 import Picamera2 + +with Picamera2() as picam2: + picam2.start() + + for _ in range(25): + with picam2.captured_request() as request: + print(request) + with picam2.captured_request(flush=True) as request: + print(request) diff --git a/examples/stereo_preview.py b/examples/stereo_preview.py new file mode 100755 index 00000000..8f416a27 --- /dev/null +++ b/examples/stereo_preview.py @@ -0,0 +1,73 @@ +#!/usr/bin/python3 + +import time +from threading import Lock + +from picamera2 import MappedArray, Picamera2, libcamera + +cam2_request = None +lock = Lock() + + +def pre_callback(request): + # Set the size, to make preview window and MappedArray remapping work + request.config["main"]["size"] = full_size + request.stream_map["main"].configuration.size = libcamera.Size(*full_size) + + +def copy_image(request): + global cam2_request + with lock: + request_2 = cam2_request + if request_2 is None: + return + request_2.acquire() + # Copy second image into right hand side of main image + with MappedArray(request, "main") as m1, MappedArray(request_2, "main") as m2: + a1 = m1.array + a2 = m2.array + a1[:, -a2.shape[1]:] = a2 + request_2.release() + + +def save_request(request): + # Store most recent request for use by other camera + global cam2_request + with lock: + if cam2_request is not None: + cam2_request.release() + request.acquire() + cam2_request = request + + +picam2a = Picamera2(0) + +full_size = (1920, 1080) +half_size = (full_size[0] // 2, full_size[1]) +# Calculate stride for full frame +full_config = picam2a.create_preview_configuration({"size": full_size}) +picam2a.configure(full_config) +stride = picam2a.camera_config["main"]["stride"] + +# Configure as half frame, with full frame stride so right side is blank +picam2a.pre_callback = pre_callback +picam2a.post_callback = copy_image +main_config = picam2a.create_preview_configuration( + main={"size": half_size, "stride": stride}, + controls={"ScalerCrop": (0, 0, picam2a.sensor_resolution[0], picam2a.sensor_resolution[1])} +) +picam2a.configure(main_config) +picam2a.start_preview(True) + +# Configure as half frame normally +picam2b = Picamera2(1) +picam2b.pre_callback = save_request +half_config = picam2a.create_preview_configuration( + main={"size": half_size}, + controls={"ScalerCrop": (0, 0, picam2a.sensor_resolution[0], picam2a.sensor_resolution[1])} +) +picam2b.configure(half_config) + +picam2a.start() +picam2b.start() +time.sleep(10) diff --git a/examples/tensorflow/remove_background.py b/examples/tensorflow/remove_background.py index 0e39b3c5..68c09328 100755 --- a/examples/tensorflow/remove_background.py +++ b/examples/tensorflow/remove_background.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -# Usage: ./remove_background.py --model deeplapv3.tflite --background thing.png +# Usage: ./remove_background.py --model deeplabv3.tflite --background thing.png import argparse diff --git a/picamera2/__init__.py b/picamera2/__init__.py index bbb4992f..e544081d 100644 --- a/picamera2/__init__.py +++ b/picamera2/__init__.py @@ -1,10 +1,12 @@ import os +from concurrent.futures import TimeoutError import libcamera from .configuration import CameraConfiguration, StreamConfiguration from .controls import Controls from .converters import YUV420_to_RGB +from .job import CancelledError from .metadata import Metadata from .picamera2 import Picamera2, Preview from .platform import Platform, get_platform @@ -50,3 +52,17 @@ def libcamera_colour_spaces_eq(c1, c2): libcamera.ColorSpace.__repr__ = libcamera.ColorSpace.__str__ 
libcamera.ColorSpace.__eq__ = libcamera_colour_spaces_eq + + +def _libcamera_size_to_tuple(sz): + return (sz.width, sz.height) + + +libcamera.Size.to_tuple = _libcamera_size_to_tuple + + +def _libcamera_rect_to_tuple(rect): + return (rect.x, rect.y, rect.width, rect.height) + + +libcamera.Rectangle.to_tuple = _libcamera_rect_to_tuple diff --git a/picamera2/configuration.py b/picamera2/configuration.py index 7c6a3f2d..9c01794b 100644 --- a/picamera2/configuration.py +++ b/picamera2/configuration.py @@ -83,7 +83,7 @@ def align(self, optimal=True): class StreamConfiguration(Configuration): - _ALLOWED_FIELDS = ("size", "format", "stride", "framesize") + _ALLOWED_FIELDS = ("size", "format", "stride", "framesize", "preserve_ar") _FIELD_CLASS_MAP = {} _FORWARD_FIELDS = {} diff --git a/picamera2/devices/__init__.py b/picamera2/devices/__init__.py new file mode 100644 index 00000000..5ef65f35 --- /dev/null +++ b/picamera2/devices/__init__.py @@ -0,0 +1,7 @@ +try: + # Hailo requires hailo_platform package, which may not be installed on non-Hailo platforms. + from .hailo import Hailo +except ModuleNotFoundError: + pass +from .imx500 import IMX500 +from .imx708 import IMX708 diff --git a/picamera2/devices/hailo/__init__.py b/picamera2/devices/hailo/__init__.py new file mode 100644 index 00000000..abebb8c3 --- /dev/null +++ b/picamera2/devices/hailo/__init__.py @@ -0,0 +1 @@ +from .hailo import Hailo diff --git a/picamera2/devices/hailo/hailo.py b/picamera2/devices/hailo/hailo.py new file mode 100644 index 00000000..dbc3d437 --- /dev/null +++ b/picamera2/devices/hailo/hailo.py @@ -0,0 +1,186 @@ +from concurrent.futures import Future +from functools import partial + +import numpy as np +from hailo_platform import HEF, FormatType, HailoSchedulingAlgorithm, VDevice + + +class Hailo: + TARGET = None + TARGET_REF_COUNT = 0 + + def __init__(self, hef_path, batch_size=None, output_type='FLOAT32'): + """ + Initialize the HailoAsyncInference class with the provided HEF model file path. + + Args: + hef_path (str): Path to the HEF model file. + batch_size (int): Batch size for inference. + output_type (str): Format type of the output stream. + """ + params = VDevice.create_params() + params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN + + self.batch_size = batch_size + self.hef = HEF(hef_path) + if Hailo.TARGET is None: + Hailo.TARGET = VDevice(params) + Hailo.TARGET_REF_COUNT += 1 + self.target = Hailo.TARGET + self.infer_model = self.target.create_infer_model(hef_path) + self.infer_model.set_batch_size(1 if batch_size is None else batch_size) + self._set_input_output(output_type) + self.input_vstream_info, self.output_vstream_info = self._get_vstream_info() + self.configured_infer_model = self.infer_model.configure() + + def __enter__(self): + """Used for allowing use with context manager.""" + return self + + def __exit__(self, exc_type, exc_val, exc_traceback): + """Used for allowing use with context manager.""" + self.close() + + def _set_input_output(self, output_type): + """ + Set the input and output layer information for the HEF model. + + Args: + output_type (str): Format type of the output stream. 
+ """ + input_format_type = self.hef.get_input_vstream_infos()[0].format.type + self.infer_model.input().set_format_type(input_format_type) + output_format_type = getattr(FormatType, output_type) + for output in self.infer_model.outputs: + output.set_format_type(output_format_type) + self.num_outputs = len(self.infer_model.outputs) + + def callback(self, completion_info, bindings, future, last): + """ + Callback function for handling inference results. + + Args: + completion_info: Information about the completion of the inference task. + bindings: Bindings object containing input and output buffers. + """ + if future._has_had_error: + # Don't really know if this can happen. + return + elif completion_info.exception: + future._has_had_error = True + future.set_exception(completion_info.exception) + else: + if self.num_outputs <= 1: + # Only one output. Return the output directly. + if self.batch_size is None: + # No batching. Return this single output on its own. + future._intermediate_result = bindings.output().get_buffer() + else: + # Return a list containing an output for each item in the batch. + future._intermediate_result.append(bindings.output().get_buffer()) + else: + # Multiple outputs. Return a dictionary of outputs keyed on the layer name. + if self.batch_size is None: + # No batching. Use a single output as the value for each key. + for name in bindings._output_names: + future._intermediate_result[name] = bindings.output(name).get_buffer() + else: + # Each key contains a list of outputs, one per item in the batch. + for name in bindings._output_names: + future._intermediate_result[name].append(bindings.output(name).get_buffer()) + if last: + future.set_result(future._intermediate_result) + + def _get_vstream_info(self): + """ + Get information about input and output stream layers. + + Returns: + tuple: List of input stream layer information, List of output stream layer information. + """ + input_vstream_info = self.hef.get_input_vstream_infos() + output_vstream_info = self.hef.get_output_vstream_infos() + + return input_vstream_info, output_vstream_info + + def get_input_shape(self): + """ + Get the shape of the model's input layer. + + Returns: + tuple: Shape of the model's input layer. + """ + return self.input_vstream_info[0].shape # Assumes that the model has one input + + def describe(self): + """ + Return information that describes what's in the model. + + Returns: + A pair of lists containing, respectively, information about the input and output layers. + """ + inputs = [(layer.name, layer.shape, layer.format.type) for layer in self.hef.get_input_vstream_infos()] + outputs = [(layer.name, layer.shape, layer.format.type) for layer in self.hef.get_output_vstream_infos()] + + return inputs, outputs + + def run_async(self, input_data): + """ + Run asynchronous inference on the Hailo-8 device. + + Args: + input_data (np.ndarray): Input data for inference. + + Returns: + future: Future to wait on for the inference results. 
+ """ + if self.batch_size is None: + input_data = np.expand_dims(input_data, axis=0) + + future = Future() + future._has_had_error = False + if self.num_outputs <= 1: + future._intermediate_result = [] + else: + future._intermediate_result = {output.name: [] for output in self.infer_model.outputs} + + for i, frame in enumerate(input_data): + last = i == len(input_data) - 1 + bindings = self._create_bindings() + bindings.input().set_buffer(frame) + self.configured_infer_model.wait_for_async_ready(timeout_ms=10000) + self.configured_infer_model.run_async([bindings], + partial(self.callback, bindings=bindings, future=future, last=last)) + + return future + + def run(self, input_data): + """ + Run asynchronous inference on the Hailo-8 device. + + Args: + input_data (np.ndarray): Input data for inference. + + Returns: + inference output or list: Inference output or List of inference outputs if batch_size is not None. + """ + future = self.run_async(input_data) + return future.result() + + def _create_bindings(self): + """ + Create bindings for input and output buffers. + + Returns: + bindings: Bindings object with input and output buffers. + """ + output_buffers = {name: np.empty(self.infer_model.output(name).shape, dtype=np.float32) + for name in self.infer_model.output_names} + return self.configured_infer_model.create_bindings(output_buffers=output_buffers) + + def close(self): + """Release the Hailo device.""" + del self.configured_infer_model + Hailo.TARGET_REF_COUNT -= 1 + if Hailo.TARGET_REF_COUNT == 0: + self.target.release() diff --git a/picamera2/devices/imx500/__init__.py b/picamera2/devices/imx500/__init__.py new file mode 100644 index 00000000..2cabd5ee --- /dev/null +++ b/picamera2/devices/imx500/__init__.py @@ -0,0 +1,6 @@ +from .imx500 import IMX500, NetworkIntrinsics +from .postprocess_efficientdet_lite0 import \ + postprocess_efficientdet_lite0_detection +from .postprocess_nanodet import postprocess_nanodet_detection +from .postprocess_yolov5 import postprocess_yolov5_detection +from .postprocess_yolov8 import postprocess_yolov8_detection diff --git a/picamera2/devices/imx500/imx500.py b/picamera2/devices/imx500/imx500.py new file mode 100644 index 00000000..ee944e78 --- /dev/null +++ b/picamera2/devices/imx500/imx500.py @@ -0,0 +1,719 @@ +import ctypes +import fcntl +import io +import json +import multiprocessing +import os +import struct +import sys +import time +from typing import List, Optional + +import jsonschema +import numpy as np +from libarchive.read import fd_reader +from libcamera import Rectangle, Size +from tqdm import tqdm +from v4l2 import (VIDIOC_S_CTRL, VIDIOC_S_EXT_CTRLS, v4l2_control, + v4l2_ext_control, v4l2_ext_controls) + +from picamera2 import CompletedRequest, Picamera2 + +NETWORK_NAME_LEN = 64 +MAX_NUM_TENSORS = 16 +MAX_NUM_DIMENSIONS = 16 + +FW_LOADER_STAGE = 0 +FW_MAIN_STAGE = 1 +FW_NETWORK_STAGE = 2 + +NETWORK_FW_FD_CTRL_ID = 0x00982901 +ROI_CTRL_ID = 0x00982900 + + +# struct OutputTensorInfo from libcamera +class OutputTensorInfo(ctypes.LittleEndianStructure): + _fields_ = [ + ('tensor_data_num', ctypes.c_uint32), + ('num_dimensions', ctypes.c_uint32), + ('size', ctypes.c_uint16 * MAX_NUM_DIMENSIONS), + ] + + +# struct CnnOutputTensorInfoExported from libcamera +class CnnOutputTensorInfoExported(ctypes.LittleEndianStructure): + _fields_ = [ + ('network_name', ctypes.c_char * NETWORK_NAME_LEN), + ('num_tensors', ctypes.c_uint32), + ('info', OutputTensorInfo * MAX_NUM_TENSORS) + ] + + +class NetworkIntrinsics: + def __init__(self, val=None): + 
self.__intrinsics: Optional[dict] = None + self.__schema = { + "$schema": "https://json-schema.org/draft-07/schema", + "title": "network_intrinsics", + "type": "object", + "properties": { + "task": { + "type": "string", + "enum": ["classification", "object detection", "pose estimation", "segmentation"], + "description": "Network task", + }, + "inference_rate": {"type": "number", "minimum": 0}, + "cpu": { + "type": "object", + "properties": { + "bbox_normalization": {"type": "boolean"}, + "bbox_order": {"type": "string", "enum": ["xy", "yx"]}, + "softmax": {"type": "boolean"}, + "post_processing": {"type": "string"}, + }, + }, + "input_aspect_ratio": { + "type": "object", + "properties": { + "width": {"type": "integer", "exclusiveMinimum": 0}, + "height": {"type": "integer", "exclusiveMinimum": 0}, + }, + "required": ["width", "height"], + }, + "classes": { + "type": "object", + "properties": { + "labels": {"type": "array", "items": {"type": "string"}}, + "ignore_undefined": {"type": "boolean"}, + }, + }, + }, + } + if val is not None: + jsonschema.validate(val, self.__schema) + self.__intrinsics = val + + self.__defaults = {'inference_rate': 30.0} + jsonschema.validate(self.__defaults, self.__schema | {'additionalProperties': False}) + + @property + def intrinsics(self) -> Optional[dict]: + return self.__intrinsics + + @intrinsics.setter + def intrinsics(self, val): + jsonschema.validate(val, self.__schema) + self.__intrinsics = val + + def __repr__(self): + return json.dumps(self.__intrinsics) if self.__intrinsics else "" + + def __top_level_validated_insert(self, val: dict): + jsonschema.validate(val, self.__schema | {'additionalProperties': False}) + self.__intrinsics = self.__intrinsics | val if self.__intrinsics else val + + def __intrinsics_has_key(self, key: str) -> bool: + return key in self.__intrinsics if self.__intrinsics else False + + def __intrinsics_get_key(self, key, default=None): + return self.__intrinsics.get(key, default) if self.__intrinsics else default + + def update_with_defaults(self): + # Updates intrinsics with default settings (but does not overwrite) + if not self.__intrinsics: + self.__intrinsics = {} + self.__intrinsics = self.__defaults | self.__intrinsics + + @property + def task(self) -> Optional[str]: + return self.__intrinsics_get_key('task') + + @task.setter + def task(self, val: str): + self.__top_level_validated_insert({'task': val}) + + @property + def inference_rate(self) -> Optional[float]: + return self.__intrinsics_get_key('inference_rate') + + @inference_rate.setter + def inference_rate(self, val: float): + if val < 0: + if self.__intrinsics is not None: + self.__intrinsics.pop('inference_rate', None) + else: + self.__top_level_validated_insert({'inference_rate': val}) + + @property + def fps(self) -> Optional[float]: + # @deprecated("Prefer inference_rate") + return self.inference_rate + + @fps.setter + def fps(self, val: Optional[float]): + # @deprecated("Prefer inference_rate") + self.inference_rate = val + + def __get_cpu(self, key: str): + return self.__intrinsics['cpu'].get(key, None) if self.__intrinsics_has_key('cpu') else None + + def __set_cpu(self, val: dict): + jsonschema.validate({'cpu': val}, self.__schema | {'additionalProperties': False}) + cpu = self.__intrinsics_get_key('cpu', {}) | val + if self.__intrinsics: + self.__intrinsics['cpu'] = cpu + else: + self.__intrinsics = {'cpu': cpu} + + @property + def bbox_normalization(self) -> Optional[bool]: + return self.__get_cpu('bbox_normalization') + + @bbox_normalization.setter + 
def bbox_normalization(self, val: Optional[bool]): + if val is None: + return + + if val: + self.__set_cpu({'bbox_normalization': val}) + elif self.__intrinsics_has_key('cpu'): + self.__intrinsics['cpu'].pop('bbox_normalization', None) + + if self.__intrinsics_has_key('cpu') and len(self.__intrinsics['cpu']) == 0: + self.__intrinsics.pop('cpu') + + @property + def bbox_order(self) -> Optional[str]: + return self.__get_cpu('bbox_order') + + @bbox_order.setter + def bbox_order(self, val: str): + if val not in ["xy", "yx"]: + raise ValueError("bbox_order must be either 'xy' or 'yx'") + self.__set_cpu({'bbox_order': val}) + if self.__intrinsics_has_key('cpu') and len(self.__intrinsics['cpu']) == 0: + self.__intrinsics.pop('cpu') + + @property + def softmax(self) -> Optional[bool]: + return self.__get_cpu('softmax') + + @softmax.setter + def softmax(self, val: Optional[bool]): + if val is None: + return + + if val: + self.__set_cpu({'softmax': val}) + elif self.__intrinsics_has_key('cpu'): + self.__intrinsics['cpu'].pop('softmax', None) + + if self.__intrinsics_has_key('cpu') and len(self.__intrinsics['cpu']) == 0: + self.__intrinsics.pop('cpu') + + @property + def postprocess(self) -> Optional[str]: + return self.__get_cpu('post_processing') + + @postprocess.setter + def postprocess(self, val: str): + if val != "": + self.__set_cpu({'post_processing': val}) + elif self.__intrinsics_has_key('cpu'): + self.__intrinsics['cpu'].pop('post_processing', None) + + if self.__intrinsics_has_key('cpu') and len(self.__intrinsics['cpu']) == 0: + self.__intrinsics.pop('cpu') + + @property + def preserve_aspect_ratio(self) -> Optional[bool]: + if not self.__intrinsics_has_key('input_aspect_ratio'): + return None + ar = self.__intrinsics['input_aspect_ratio'] + return ar['width'] == ar['height'] + + @preserve_aspect_ratio.setter + def preserve_aspect_ratio(self, val: Optional[bool]): + if val is None: + return + + if val: + iar = {'input_aspect_ratio': {'width': 1, 'height': 1}} + self.__top_level_validated_insert(iar) + elif self.__intrinsics_has_key('input_aspect_ratio'): + self.__intrinsics.pop('input_aspect_ratio') + + @property + def labels(self) -> Optional[List[str]]: + return self.__intrinsics['classes'].get('labels', None) if self.__intrinsics_has_key('classes') else None + + @labels.setter + def labels(self, val: List[str]): + if len(val) != 0: + classes = {'labels': val} + jsonschema.validate({'classes': classes}, self.__schema | {'additionalProperties': False}) + + classes = self.__intrinsics_get_key('classes', {}) | classes + if self.__intrinsics: + self.__intrinsics['classes'] = classes + else: + self.__intrinsics = {'classes': classes} + elif self.__intrinsics_has_key('classes'): + self.__intrinsics['classes'].pop('labels', None) + if len(self.__intrinsics['classes']) == 0: + self.__intrinsics.pop('classes') + + @property + def ignore_dash_labels(self) -> Optional[bool]: + return self.__intrinsics['classes'].get('ignore_undefined', None) if self.__intrinsics_has_key('classes') else None + + @ignore_dash_labels.setter + def ignore_dash_labels(self, val: Optional[bool]): + if val is None: + return + + if val: + iu = {'ignore_undefined': val} + jsonschema.validate({'classes': iu}, self.__schema | {'additionalProperties': False}) + + classes = {'classes': self.__intrinsics_get_key('classes', {}) | iu} + self.__intrinsics = self.__intrinsics | classes if self.__intrinsics else classes + elif self.__intrinsics_has_key('classes'): + self.__intrinsics['classes'].pop('ignore_undefined', None) + if 
len(self.__intrinsics['classes']) == 0: + self.__intrinsics.pop('classes') + + +class IMX500: + def __init__(self, network_file: str, camera_id: str = ''): + self.device_fd = None + self.fw_progress = None + self.fw_progress_chunk = None + self.__cfg = {'network_file': network_file, 'input_tensor': {}} + + imx500_device_id = None + spi_device_id = None + for i in range(32): + test_dir = f'/sys/class/video4linux/v4l-subdev{i}/device' + module_dir = f'{test_dir}/driver/module' + id_dir = f'{test_dir}/of_node' + if os.path.exists(module_dir) and os.path.islink(module_dir) and os.path.islink(id_dir) \ + and 'imx500' in os.readlink(module_dir): + if camera_id == '' or (camera_id in os.readlink(id_dir)): + self.device_fd = open(f'/dev/v4l-subdev{i}', 'rb+', buffering=0) + imx500_device_id = os.readlink(test_dir).split('/')[-1] + spi_device_id = imx500_device_id.replace('001a', '0040') + camera_info = Picamera2.global_camera_info() + self.__camera_num = next((c['Num'] for c in camera_info if c['Model'] == 'imx500' + and c['Id'] in os.readlink(id_dir))) + break + + if self.device_fd is None: + raise RuntimeError('IMX500: Requested camera dev-node not found') + + # Progress status specific debugfs entries. + if imx500_device_id: + self.fw_progress = open(f'/sys/kernel/debug/imx500-fw:{imx500_device_id}/fw_progress', 'r') + if spi_device_id: + self.fw_progress_chunk = open(f'/sys/kernel/debug/rp2040-spi:{spi_device_id}/transfer_progress', 'r') + + if self.config['network_file'] != '': + self.__set_network_firmware(os.path.abspath(self.config['network_file'])) + self.__ni_from_network(os.path.abspath(self.config['network_file'])) + + if 'norm_val' not in self.__cfg['input_tensor']: + self.__cfg['input_tensor']['norm_val'] = [-2048, -2048, -2048] + if 'norm_shift' not in self.__cfg: + self.__cfg['input_tensor']['norm_shift'] = [4, 4, 4] + if 'div_val' not in self.__cfg: + self.__cfg['input_tensor']['div_val'] = [1024, 1024, 1024] + if 'div_shift' not in self.__cfg: + self.__cfg['input_tensor']['div_shift'] = 6 + + full_sensor = self.__get_full_sensor_resolution() + self.set_inference_roi_abs(full_sensor.to_tuple()) + + @staticmethod + def __get_full_sensor_resolution(): + """Full sensor resolution as a Rectangle object.""" + return Rectangle(0, 0, 4056, 3040) + + def __del__(self): + if self.device_fd: + self.device_fd.close() + + @property + def camera_num(self): + return self.__camera_num + + @property + def config(self) -> dict: + return self.__cfg + + @property + def network_intrinsics(self) -> Optional[NetworkIntrinsics]: + return self.__cfg.get('intrinsics', None) + + def convert_inference_coords(self, coords: tuple, metadata: dict, picam2: Picamera2, stream='main') -> tuple: + """Convert relative inference coordinates into the output image coordinates space.""" + isp_output_size = Size(*picam2.camera_configuration()[stream]['size']) + sensor_output_size = Size(*picam2.camera_configuration()['raw']['size']) + scaler_crop = Rectangle(*metadata['ScalerCrop']) + + y0, x0, y1, x1 = coords + full_sensor = self.__get_full_sensor_resolution() + width, height = full_sensor.size.to_tuple() + obj = Rectangle( + *np.maximum( + np.array([x0 * width, y0 * height, (x1 - x0) * width, (y1 - y0) * height]), + 0, + ).astype(np.int32) + ) + out = self.__get_obj_scaled(obj, isp_output_size, scaler_crop, sensor_output_size) + return out.to_tuple() + + def get_fw_upload_progress(self, stage_req) -> tuple: + """Returns the current progress of the fw upload in the form of (current, total).""" + progress_block = 0 + 
progress_chunk = 0 + size = 0 + stage = 0 + + if self.fw_progress: + self.fw_progress.seek(0) + progress = self.fw_progress.readline().strip().split() + stage = int(progress[0]) + progress_block = int(progress[1]) + size = int(progress[2]) + + if self.fw_progress_chunk: + self.fw_progress_chunk.seek(0) + progress_chunk = int(self.fw_progress_chunk.readline().strip()) + + if stage == stage_req: + return (min(progress_block + progress_chunk, size), size) + else: + return (0, 0) + + def show_network_fw_progress_bar(self): + p = multiprocessing.Process(target=self.__do_progress_bar, + args=(FW_NETWORK_STAGE, 'Network Firmware Upload')) + p.start() + p.join(0) + + def __do_progress_bar(self, stage_req, title): + with tqdm(unit='bytes', unit_scale=True, unit_divisor=1024, desc=title, leave=True) as t: + last_update = 0 + while True: + current, total = self.get_fw_upload_progress(stage_req) + if total: + t.total = total + t.update(current - last_update) + last_update = current + if current > 0.95 * total: + t.update(total - last_update) + break + time.sleep(0.5) + + def get_roi_scaled(self, request: CompletedRequest, stream="main") -> tuple: + """Get the region of interest (ROI) in output image coordinates space.""" + picam2 = request.picam2 + isp_output_size = self.get_isp_output_size(picam2, stream) + sensor_output_size = self.get_isp_output_size(picam2, 'raw') + scaler_crop = Rectangle(*request.get_metadata()['ScalerCrop']) + obj = self.__get_full_sensor_resolution() + roi = self.__get_obj_scaled(obj, isp_output_size, scaler_crop, sensor_output_size) + return roi.to_tuple() + + @staticmethod + def get_isp_output_size(picam2, stream="main") -> tuple: + return Size(*picam2.camera_configuration()[stream]['size']) + + def __get_obj_scaled(self, obj, isp_output_size, scaler_crop, sensor_output_size) -> Rectangle: + """Scale the object coordinates based on the camera configuration and sensor properties.""" + full_sensor = self.__get_full_sensor_resolution() + width, height = full_sensor.size.to_tuple() + sensor_crop = scaler_crop.scaled_by(sensor_output_size, full_sensor.size) + + # Make sure the object is bound to the user requested ROI. 
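+        # (With the default full-sensor ROI set in __init__ this bounding step is effectively a
+        # no-op; it only trims the box when a smaller ROI has been set via set_inference_roi_abs().)
+        # The box is then mapped into raw-stream coordinates, clamped to the scaled ScalerCrop,
+        # rebased to the crop origin and finally scaled to the ISP output size.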
+        if 'roi' in self.config and self.config['roi'] != Rectangle(0, 0, 0, 0):
+            obj = obj.bounded_to(self.config['roi'])
+
+        obj_sensor = obj.scaled_by(sensor_output_size, Size(width, height))
+        obj_bound = obj_sensor.bounded_to(sensor_crop)
+        obj_translated = obj_bound.translated_by(-sensor_crop.topLeft)
+        obj_scaled = obj_translated.scaled_by(isp_output_size, sensor_crop.size)
+        return obj_scaled
+
+    def get_input_size(self) -> tuple:
+        """Get the model input tensor size as (width, height)."""
+        return self.config['input_tensor_size']
+
+    def input_tensor_image(self, input_tensor):
+        """Convert input tensor in planar format to interleaved RGB."""
+        width = self.config['input_tensor']['width']
+        height = self.config['input_tensor']['height']
+        r1 = np.array(input_tensor, dtype=np.uint8).astype(np.int32).reshape((3,) + (height, width))
+        r1 = r1[(2, 1, 0), :, :]
+        norm_val = self.config['input_tensor']['norm_val']
+        norm_shift = self.config['input_tensor']['norm_shift']
+        div_val = self.config['input_tensor']['div_val']
+        div_shift = self.config['input_tensor']['div_shift']
+        for i in [0, 1, 2]:
+            r1[i] = ((((r1[i] << norm_shift[i]) - norm_val[i]) << div_shift) // div_val[i]) & 0xff
+
+        return np.transpose(r1, (1, 2, 0)).astype(np.uint8)
+
+    def get_outputs(self, metadata: dict, add_batch=False) -> Optional[list[np.ndarray]]:
+        """Get the model outputs."""
+        output_tensor = metadata.get('CnnOutputTensor')
+        if not output_tensor:
+            return None
+
+        np_output = np.fromiter(output_tensor, dtype=np.float32)
+        output_shapes = self.get_output_shapes(metadata)
+        offset = 0
+        outputs = []
+        for tensor_shape in output_shapes:
+            size = np.prod(tensor_shape)
+            reshaped_tensor = np_output[offset:offset + size].reshape(tensor_shape, order='F')
+            if add_batch:
+                reshaped_tensor = np.expand_dims(reshaped_tensor, 0)
+            outputs.append(reshaped_tensor)
+            offset += size
+        return outputs
+
+    def get_output_shapes(self, metadata: dict) -> list[tuple[int]]:
+        """Get the model output shapes, or an empty list if there is no output tensor info."""
+        output_tensor_info = metadata.get('CnnOutputTensorInfo')
+        if not output_tensor_info:
+            return []
+        output_tensor_info = self.__get_output_tensor_info(output_tensor_info)['info']
+        return [o['size'] for o in output_tensor_info]
+
+    def set_inference_roi_abs(self, roi: tuple):
+        """
+        Set the absolute inference image crop.
+
+        Specify an absolute region of interest in the form of a (left, top, width, height) crop for the input inference
+        image. The co-ordinates are based on the full sensor resolution.
+        """
+        roi = Rectangle(*roi)
+        roi = roi.bounded_to(self.__get_full_sensor_resolution())
+
+        r = (ctypes.c_uint32 * 4)()
+        r[0] = roi.x
+        r[1] = roi.y
+        r[2] = roi.width
+        r[3] = roi.height
+
+        c = (v4l2_ext_control * 1)()
+        c[0].p_u32 = r
+        c[0].id = ROI_CTRL_ID
+        c[0].size = 16
+
+        ctrl = v4l2_ext_controls()
+        ctrl.count = 1
+        ctrl.controls = c
+
+        try:
+            fcntl.ioctl(self.device_fd, VIDIOC_S_EXT_CTRLS, ctrl)
+            self.__cfg['roi'] = roi
+        except OSError as err:
+            print(f'IMX500: Unable to set ROI control in the device driver: {err}')
+
+    def set_inference_aspect_ratio(self, aspect_ratio: tuple):
+        """
+        Set the aspect ratio of the inference image.
+
+        Specify a pixel aspect ratio needed for the input inference image relative to the full sensor resolution.
+        This simply calculates an ROI based on a centre crop and calls set_inference_roi_abs().
+ """ + f = self.__get_full_sensor_resolution() + r = f.size.bounded_to_aspect_ratio(Size(aspect_ratio[0], aspect_ratio[1])) + r = r.centered_to(f.center).enclosed_in(f) + self.set_inference_roi_abs(r.to_tuple()) + + def set_auto_aspect_ratio(self): + """Set the inference image crop to presereve the input tensor aspect ratio.""" + self.set_inference_aspect_ratio(self.config['input_tensor_size']) + + def __get_output_tensor_info(self, tensor_info) -> dict: + """Return the network string along with a list of output tensor parameters.""" + if type(tensor_info) not in [bytes, bytearray]: + tensor_info = bytes(tensor_info) + + size = ctypes.sizeof(CnnOutputTensorInfoExported) + if len(tensor_info) != size: + raise ValueError(f'tensor info length {len(tensor_info)} does not match expected size {size}') + + # Create an instance of the struct and copy data into it + parsed = CnnOutputTensorInfoExported() + ctypes.memmove(ctypes.addressof(parsed), tensor_info, size) + + result = { + 'network_name': parsed.network_name.decode('utf-8').strip('\x00'), + 'num_tensors': parsed.num_tensors, + 'info': [] + } + + for t in parsed.info[0:parsed.num_tensors]: + info = { + 'tensor_data_num': t.tensor_data_num, + 'num_dimensions': t.num_dimensions, + 'size': list(t.size)[0:t.num_dimensions], + } + result['info'].append(info) + + return result + + def __get_input_tensor_info(self, tensor_info) -> tuple[str, int, int, int]: + """Return the input tensor parameters in the form (network_name, width, height, num_channels).""" + NETWORK_NAME_LEN = 64 + tensor_fmt = f'{NETWORK_NAME_LEN}sIII' + + if type(tensor_info) not in [bytes, bytearray]: + tensor_info = bytes(tensor_info) + + network_name, width, height, num_channels = struct.unpack(tensor_fmt, tensor_info) + network_name = network_name.decode('utf-8').rstrip('\0') + return (network_name, width, height, num_channels) + + @staticmethod + def get_kpi_info(metadata: dict) -> Optional[tuple[float, float]]: + """Return the KPI parameters in the form (dnn_runtime, dsp_runtime) in milliseconds.""" + kpi_info = metadata.get('CnnKpiInfo') + if kpi_info is None: + return None + dnn_runtime, dsp_runtime = kpi_info[0], kpi_info[1] + return dnn_runtime / 1000, dsp_runtime / 1000 + + def __set_network_firmware(self, network_filename: str): + """Provides a firmware rpk file to upload to the IMX500. This must be called before Picamera2 is configured.""" + if not os.path.isfile(network_filename): + raise RuntimeError(f'Firmware file {network_filename} does not exist.') + + fd = os.open(network_filename, os.O_RDONLY) + if fd: + ctrl = v4l2_control() + ctrl.id = NETWORK_FW_FD_CTRL_ID + ctrl.value = fd + + try: + fcntl.ioctl(self.device_fd, VIDIOC_S_CTRL, ctrl) + print('\n------------------------------------------------------------------------------------------------------------------\n' # noqa + 'NOTE: Loading network firmware onto the IMX500 can take several minutes, please do not close down the application.' 
# noqa + '\n------------------------------------------------------------------------------------------------------------------\n', file=sys.stderr) # noqa + except OSError as err: + raise RuntimeError(f'IMX500: Unable to set network firmware {network_filename}: {err}') + finally: + os.close(fd) + + def __ni_from_network(self, network_filename: str): + """Extracts 'network_info.txt' from CPIO-archive appended to the network rpk.""" + with open(network_filename, 'rb') as fp: + fw = memoryview(fp.read()) + + # Iterate through network firmware discarding blocks + cpio_offset = 0 + while True: + # Parse header (+ current block size) + (magic, size) = struct.unpack('>4sI', fw[:8]) + if not magic == b'9464': + break + fw = fw[size + 60:] + # Ensure footer is as expected + (magic,) = struct.unpack('4s', fw[:4]) + if not magic == b'3695': + raise RuntimeError(f'No matching footer found in firmware file {network_filename}') + fw = fw[4:] + cpio_offset += size + 64 + + cpio_fd = os.open(network_filename, os.O_RDONLY) + os.lseek(cpio_fd, cpio_offset, os.SEEK_SET) + + with fd_reader(cpio_fd) as archive: + for entry in archive: + if 'network_info.txt' == str(entry): + self.__cfg['network_info_raw'] = b''.join(entry.get_blocks()) + elif 'network_intrinsics' == str(entry): + self.__cfg['intrinsics'] = NetworkIntrinsics(json.loads(b''.join(entry.get_blocks()))) + + os.close(cpio_fd) + + if 'network_info_raw' not in self.__cfg: + return + + res = {} + buf = io.StringIO(self.__cfg['network_info_raw'].decode('ascii')) + for line in buf: + key, value = line.strip().split('=') + if key == 'networkID': + nid: int = 0 + for idx, x in enumerate(value): + nid |= (ord(x) - ord('0')) << (20 - idx * 4) + res[key] = nid + if key == 'apParamSize': + res[key] = int(value) + if key == 'networkNum': + res[key] = int(value) + + res['network'] = {} + networks = self.__cfg['network_info_raw'].decode('ascii').split('networkOrdinal=')[1:] + for nw in networks: + buf = io.StringIO(nw) + nw_idx = int(buf.readline()) + nw_properties = {} + for line in buf: + key, value = line.strip().split('=') + nw_properties[key] = value + res['network'][nw_idx] = nw_properties + + if len(res['network']) != res['networkNum']: + raise RuntimeError('Insufficient networkNum settings in network_info.txt') + + self.__cfg['network_info'] = res + + # Extract some input tensor config params + self.__cfg['input_tensor']['width'] = int(res['network'][0]['inputTensorWidth']) + self.__cfg['input_tensor']['height'] = int(res['network'][0]['inputTensorHeight']) + self.__cfg['input_tensor_size'] = (self.config['input_tensor']['width'], + self.config['input_tensor']['height']) + + input_format = self.__cfg['network_info']['network'][0]['inputTensorFormat'] + inputTensorNorm_K03 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K03'], 0) + inputTensorNorm_K13 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K13'], 0) + inputTensorNorm_K23 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K23'], 0) + inputTensorNorm_K00 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K00'], 0) + inputTensorNorm_K22 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K22'], 0) + inputTensorNorm_K02 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K02'], 0) + inputTensorNorm_K20 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K20'], 0) + inputTensorNorm_K11 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K11'], 0) + + self.__cfg['input_tensor']['input_format'] = 
input_format
+
+        if input_format == 'RGB' or input_format == 'BGR':
+            norm_val_0 = \
+                inputTensorNorm_K03 if ((inputTensorNorm_K03 >> 12) & 1) == 0 else -((~inputTensorNorm_K03 + 1) & 0x1fff)
+            norm_val_1 = \
+                inputTensorNorm_K13 if ((inputTensorNorm_K13 >> 12) & 1) == 0 else -((~inputTensorNorm_K13 + 1) & 0x1fff)
+            norm_val_2 = \
+                inputTensorNorm_K23 if ((inputTensorNorm_K23 >> 12) & 1) == 0 else -((~inputTensorNorm_K23 + 1) & 0x1fff)
+            norm_val = [norm_val_0, norm_val_1, norm_val_2]
+            self.__cfg['input_tensor']['norm_val'] = norm_val
+            norm_shift = [4, 4, 4]
+            self.__cfg['input_tensor']['norm_shift'] = norm_shift
+            if input_format == 'RGB':
+                div_val_0 = \
+                    inputTensorNorm_K00 if ((inputTensorNorm_K00 >> 11) & 1) == 0 else -((~inputTensorNorm_K00 + 1) & 0x0fff)
+                div_val_2 = \
+                    inputTensorNorm_K22 if ((inputTensorNorm_K22 >> 11) & 1) == 0 else -((~inputTensorNorm_K22 + 1) & 0x0fff)
+            else:
+                div_val_0 = \
+                    inputTensorNorm_K02 if ((inputTensorNorm_K02 >> 11) & 1) == 0 else -((~inputTensorNorm_K02 + 1) & 0x0fff)
+                div_val_2 = \
+                    inputTensorNorm_K20 if ((inputTensorNorm_K20 >> 11) & 1) == 0 else -((~inputTensorNorm_K20 + 1) & 0x0fff)
+            div_val_1 = \
+                inputTensorNorm_K11 if ((inputTensorNorm_K11 >> 11) & 1) == 0 else -((~inputTensorNorm_K11 + 1) & 0x0fff)
+            self.__cfg['input_tensor']['div_val'] = [div_val_0, div_val_1, div_val_2]
+            self.__cfg['input_tensor']['div_shift'] = 6
diff --git a/picamera2/devices/imx500/postprocess.py b/picamera2/devices/imx500/postprocess.py
new file mode 100644
index 00000000..ee1c825e
--- /dev/null
+++ b/picamera2/devices/imx500/postprocess.py
@@ -0,0 +1,414 @@
+"""
+This code is based on multiple sources:
+
+https://github.com/rbgirshick/fast-rcnn
+https://github.com/ultralytics/ultralytics
+https://github.com/see--/keras-centernet
+https://github.com/stefanopini/simple-HigherHRNet
+"""
+
+from enum import Enum
+from typing import List
+
+import cv2
+import numpy as np
+
+from picamera2 import Picamera2
+
+
+def nms(dets: np.ndarray, scores: np.ndarray, iou_thres: float = 0.55, max_out_dets: int = 50) -> List[int]:
+    """
+    Perform Non-Maximum Suppression (NMS) on detected bounding boxes.
+
+    Args:
+        dets (np.ndarray): Array of bounding box coordinates of shape (N, 4) representing [y1, x1, y2, x2].
+        scores (np.ndarray): Array of confidence scores associated with each bounding box.
+        iou_thres (float, optional): IoU threshold for NMS. Default is 0.55.
+        max_out_dets (int, optional): Maximum number of output detections to keep. Default is 50.
+
+    Returns:
+        List[int]: Indices of the bounding boxes to keep after NMS.
+ + """ + y1, x1 = dets[:, 0], dets[:, 1] + y2, x2 = dets[:, 2], dets[:, 3] + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= iou_thres)[0] + order = order[inds + 1] + + return keep[:max_out_dets] + + +def combined_nms(batch_boxes, batch_scores, iou_thres: float = 0.65, conf: float = 0.55, max_out_dets: int = 50): + nms_results = [] + for boxes, scores in zip(batch_boxes, batch_scores): + xc = np.argmax(scores, 1) + xs = np.amax(scores, 1) + x = np.concatenate([boxes, np.expand_dims(xs, 1), np.expand_dims(xc, 1)], 1) + + xi = xs > conf + x = x[xi] + + x = x[np.argsort(-x[:, 4])[:8400]] + scores = x[:, 4] + x[..., :4] = convert_to_ymin_xmin_ymax_xmax_format(x[..., :4], BoxFormat.XC_YC_W_H) + offset = x[:, 5] * 640 + boxes = x[..., :4] + np.expand_dims(offset, 1) + + # Original post-processing part + valid_indexs = nms(boxes, scores, iou_thres=iou_thres, max_out_dets=max_out_dets) + x = x[valid_indexs] + nms_classes = x[:, 5] + nms_bbox = x[:, :4] + nms_scores = x[:, 4] + + nms_results.append((nms_bbox, nms_scores, nms_classes)) + + return nms_results + + +def combined_nms_seg(batch_boxes, batch_scores, batch_masks, iou_thres: float = 0.5, conf: float = 0.001, + max_out_dets: int = 300): + nms_results = [] + for boxes, scores, masks in zip(batch_boxes, batch_scores, batch_masks): + # Compute maximum scores and corresponding class indices + class_indices = np.argmax(scores, axis=1) + max_scores = np.amax(scores, axis=1) + detections = np.concatenate([boxes, np.expand_dims(max_scores, axis=1), np.expand_dims(class_indices, axis=1)], + axis=1) + + # Swap the position of the two dimensions (32, 8400) to (8400, 32) + masks = np.transpose(masks, (1, 0)) + # Filter out detections below the confidence threshold + valid_detections = max_scores > conf + + if np.all(valid_detections is False): + nms_results.append((np.ndarray(0), np.ndarray(0), np.ndarray(0), np.ndarray(0))) + else: + + detections = detections[valid_detections] + masks = masks[valid_detections] + + # Sort detections by score in descending order + sorted_indices = np.argsort(-detections[:, 4]) + detections = detections[sorted_indices] + masks = masks[sorted_indices] + + detections[..., :4] = convert_to_ymin_xmin_ymax_xmax_format(detections[..., :4], BoxFormat.XC_YC_W_H) + + # Perform class-wise NMS + unique_classes = np.unique(detections[:, 5]) + final_indices = [] + + for cls in unique_classes: + cls_indices = np.where(detections[:, 5] == cls)[0] + cls_boxes = detections[cls_indices, :4] + cls_scores = detections[cls_indices, 4] + cls_valid_indices = nms(cls_boxes, cls_scores, iou_thres=iou_thres, max_out_dets=max_out_dets) + final_indices.extend(cls_indices[cls_valid_indices]) + + final_indices = np.array(final_indices) + final_detections = detections[final_indices] + final_masks = masks[final_indices] + + # Extract class indices, bounding boxes, and scores + nms_classes = final_detections[:, 5] + nms_bbox = final_detections[:, :4] + nms_scores = final_detections[:, 4] + + # Append results including masks + nms_results.append((nms_bbox, nms_scores, nms_classes, final_masks)) + return nms_results + + +class 
BoxFormat(Enum): + YMIM_XMIN_YMAX_XMAX = 'ymin_xmin_ymax_xmax' + XMIM_YMIN_XMAX_YMAX = 'xmin_ymin_xmax_ymax' + XMIN_YMIN_W_H = 'xmin_ymin_width_height' + XC_YC_W_H = 'xc_yc_width_height' + + +def convert_to_ymin_xmin_ymax_xmax_format(boxes, orig_format: BoxFormat): + """ + Changes the box from one format to another (XMIN_YMIN_W_H --> YMIM_XMIN_YMAX_XMAX ) + + Also support in same format mode (returns the same format) + + :param boxes: + :param orig_format: + :return: box in format YMIM_XMIN_YMAX_XMAX + """ + if len(boxes) == 0: + return boxes + elif orig_format == BoxFormat.YMIM_XMIN_YMAX_XMAX: + return boxes + elif orig_format == BoxFormat.XMIN_YMIN_W_H: + boxes[:, 2] += boxes[:, 0] # convert width to xmax + boxes[:, 3] += boxes[:, 1] # convert height to ymax + boxes[:, 0], boxes[:, 1] = boxes[:, 1], boxes[:, 0].copy() # swap xmin, ymin columns + boxes[:, 2], boxes[:, 3] = boxes[:, 3], boxes[:, 2].copy() # swap xmax, ymax columns + return boxes + elif orig_format == BoxFormat.XMIM_YMIN_XMAX_YMAX: + boxes[:, 0], boxes[:, 1] = boxes[:, 1], boxes[:, 0].copy() # swap xmin, ymin columns + boxes[:, 2], boxes[:, 3] = boxes[:, 3], boxes[:, 2].copy() # swap xmax, ymax columns + return boxes + elif orig_format == BoxFormat.XC_YC_W_H: + new_boxes = np.copy(boxes) + new_boxes[:, 0] = boxes[:, 1] - boxes[:, 3] / 2 # top left y + new_boxes[:, 1] = boxes[:, 0] - boxes[:, 2] / 2 # top left x + new_boxes[:, 2] = boxes[:, 1] + boxes[:, 3] / 2 # bottom right y + new_boxes[:, 3] = boxes[:, 0] + boxes[:, 2] / 2 # bottom right x + return new_boxes + else: + raise Exception("Unsupported boxes format") + + +def clip_boxes(boxes: np.ndarray, h: int, w: int) -> np.ndarray: + """ + Clip bounding boxes to stay within the image boundaries. + + Args: + boxes (numpy.ndarray): Array of bounding boxes in format [y_min, x_min, y_max, x_max]. + h (int): Height of the image. + w (int): Width of the image. + + Returns: + numpy.ndarray: Clipped bounding boxes. + """ + boxes[..., 0] = np.clip(boxes[..., 0], a_min=0, a_max=h) + boxes[..., 1] = np.clip(boxes[..., 1], a_min=0, a_max=w) + boxes[..., 2] = np.clip(boxes[..., 2], a_min=0, a_max=h) + boxes[..., 3] = np.clip(boxes[..., 3], a_min=0, a_max=w) + return boxes + + +def scale_boxes(boxes: np.ndarray, h_image: int, w_image: int, h_model: int, w_model: int, preserve_aspect_ratio: bool, + normalized: bool = True) -> np.ndarray: + """ + Scale and offset bounding boxes based on model output size and original image size. + + Args: + boxes (numpy.ndarray): Array of bounding boxes in format [y_min, x_min, y_max, x_max]. + h_image (int): Original image height. + w_image (int): Original image width. + h_model (int): Model output height. + w_model (int): Model output width. + preserve_aspect_ratio (bool): Whether to preserve image aspect ratio during scaling + + Returns: + numpy.ndarray: Scaled and offset bounding boxes. + """ + deltaH, deltaW = 0, 0 + H, W = h_model, w_model + scale_H, scale_W = h_image / H, w_image / W + + if preserve_aspect_ratio: + scale_H = scale_W = max(h_image / H, w_image / W) + H_tag = int(np.round(h_image / scale_H)) + W_tag = int(np.round(w_image / scale_W)) + deltaH, deltaW = int((H - H_tag) / 2), int((W - W_tag) / 2) + + nh, nw = (H, W) if normalized else (1, 1) + + # Scale and offset boxes + # [y_min, x_min, y_max, x_max]. 
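+    # deltaH/deltaW are the per-side letterbox padding added when the aspect ratio is preserved,
+    # while nh/nw undo the 0..1 normalisation (when `normalized` is True) before the padding
+    # offset and the model-to-image scale factors are applied.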
+    boxes[..., 0] = (boxes[..., 0] * nw - deltaW) * scale_W
+    boxes[..., 1] = (boxes[..., 1] * nh - deltaH) * scale_H
+    boxes[..., 2] = (boxes[..., 2] * nw - deltaW) * scale_W
+    boxes[..., 3] = (boxes[..., 3] * nh - deltaH) * scale_H
+
+    # Clip boxes
+    boxes = clip_boxes(boxes, h_image, w_image)
+
+    return boxes
+
+
+def scale_coords(kpts: np.ndarray, h_image: int, w_image: int, h_model: int, w_model: int,
+                 preserve_aspect_ratio: bool) -> np.ndarray:
+    """
+    Scale and offset keypoints based on model output size and original image size.
+
+    Args:
+        kpts (numpy.ndarray): Array of keypoints in format [..., 17, 3] where the last dim is (x, y, visible).
+        h_image (int): Original image height.
+        w_image (int): Original image width.
+        h_model (int): Model output height.
+        w_model (int): Model output width.
+        preserve_aspect_ratio (bool): Whether to preserve image aspect ratio during scaling.
+
+    Returns:
+        numpy.ndarray: Scaled and offset keypoints.
+    """
+    deltaH, deltaW = 0, 0
+    H, W = h_model, w_model
+    scale_H, scale_W = h_image / H, w_image / W
+
+    if preserve_aspect_ratio:
+        scale_H = scale_W = max(h_image / H, w_image / W)
+        H_tag = int(np.round(h_image / scale_H))
+        W_tag = int(np.round(w_image / scale_W))
+        deltaH, deltaW = int((H - H_tag) / 2), int((W - W_tag) / 2)
+
+    # Scale and offset keypoints
+    kpts[..., 0] = (kpts[..., 0] - deltaH) * scale_H
+    kpts[..., 1] = (kpts[..., 1] - deltaW) * scale_W
+
+    # Clip keypoints
+    kpts = clip_coords(kpts, h_image, w_image)
+
+    return kpts
+
+
+def clip_coords(kpts: np.ndarray, h: int, w: int) -> np.ndarray:
+    """
+    Clip keypoints to stay within the image boundaries.
+
+    Args:
+        kpts (numpy.ndarray): Array of keypoints in format [..., 17, 3] where the last dim is (x, y, visible).
+        h (int): Height of the image.
+        w (int): Width of the image.
+
+    Returns:
+        numpy.ndarray: Clipped keypoints.
+ """ + kpts[..., 0] = np.clip(kpts[..., 0], a_min=0, a_max=h) + kpts[..., 1] = np.clip(kpts[..., 1], a_min=0, a_max=w) + return kpts + + +PARTS = { + 0: 'Nose', + 1: 'EyeL', + 2: 'EyeR', + 3: 'EarL', + 4: 'EarR', + 5: 'SholderL', + 6: 'SholderR', + 7: 'ElbowL', + 8: 'ElbowR', + 9: 'WristL', + 10: 'WristR', + 11: 'HipL', + 12: 'HipR', + 13: 'KneeL', + 14: 'KneeR', + 15: 'AnkleL', + 16: 'AnkleR' +} + + +class COCODrawer: + def __init__(self, categories, imx500, needs_rescale_coords=True): + self.categories = categories + self.imx500 = imx500 + self.needs_rescale_coords = needs_rescale_coords + + def get_coords(self, annotation, metadata: dict, picam2: Picamera2, stream): + if self.needs_rescale_coords: + obj_scaled = self.imx500.convert_inference_coords(annotation, metadata, picam2, stream) + x0 = obj_scaled.x + y0 = obj_scaled.y + x1 = x0 + obj_scaled.width + y1 = y0 + obj_scaled.height + else: + y0, x0, y1, x1 = annotation + y0 = max(0, y0) + x0 = max(0, x0) + return int(y0), int(x0), int(y1), int(x1) + + def draw_bounding_box(self, img, annotation, class_id, score, metadata: dict, picam2: Picamera2, stream): + y0, x0, y1, x1 = self.get_coords(annotation, metadata, picam2, stream) + text = f"{self.categories[int(class_id)]}:{score:.3f}" + cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255), 2) + cv2.putText(img, text, (x0, y0), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) + + def draw_keypoints(self, img, keypoints, min_confidence, metadata: dict, picam2: Picamera2, stream): + def get_point(index): + y0, x0 = keypoints[index][1], keypoints[index][0] + y0, x0, _, _ = self.get_coords((y0, x0, y0 + 1, x0 + 1), metadata, picam2, stream) + return x0, y0 + + skeleton = [ + [0, 1], [0, 2], [1, 3], [2, 4], # Head + [5, 6], [5, 7], [7, 9], [6, 8], # Arms + [8, 10], [5, 11], [6, 12], [11, 12], # Body + [11, 13], [12, 14], [13, 15], [14, 16] # Legs + ] + + # Draw skeleton lines + for connection in skeleton: + start_point = get_point(connection[0]) + end_point = get_point(connection[1]) + start_confidence = keypoints[connection[0]][2] + end_confidence = keypoints[connection[1]][2] + if start_confidence < min_confidence or end_confidence < min_confidence: + continue + cv2.line(img, start_point, end_point, (255, 0, 0), 2) + + # Draw keypoints as colored circles + for i in range(len(keypoints)): + x, y = get_point(i) + confidence = keypoints[i][2] + if confidence < min_confidence: + continue + cv2.circle(img, (x, y), 3, (0, 255, 0), -1) + label = f"{PARTS.get(i)}.{confidence:.3f}" + cv2.putText(img, label, (x + 5, y + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.25, (0, 255, 0), 1) + + def annotate_image(self, img, b, s, c, k, box_min_conf, kps_min_conf, metadata: dict, picam2: Picamera2, stream): + for index, row in enumerate(b): + if s[index] >= box_min_conf: + self.draw_bounding_box(img, row, c[index], s[index], metadata, picam2, stream) + if k is not None: + self.draw_keypoints(img, k[index], kps_min_conf, metadata, picam2, stream) + + def overlay_masks(self, picam2, masks, scores, colors, score_threshold=0.55, mask_threshold=0.5): + overlay = np.zeros((masks.shape[1], masks.shape[2], 4), dtype=np.uint8) + for idx, (mask, score) in enumerate(zip(masks, scores)): + if score > score_threshold: # Check if the score is above the threshold + binary_mask = (mask > mask_threshold).astype(np.uint8) + color = np.array(colors[idx][:3]) * 255 # Convert color to 0-255 scale + overlay[binary_mask == 1, :3] = color + overlay[binary_mask == 1, 3] = 127 # opacity + picam2.set_overlay(overlay) + + +def softmax(x): + y = 
np.exp(x - np.expand_dims(np.max(x, axis=-1), axis=-1)) + z = y / np.expand_dims(np.sum(y, axis=-1), axis=-1) + return z + + +def crop_mask(masks, boxes): + """ + It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box + + Args: + masks (numpy.ndarray): [h, w, n] tensor of masks + boxes (numpy.ndarray): [n, 4] tensor of bbox coordinates in relative point form + + Returns: + (numpy.ndarray): The masks are being cropped to the bounding box. + """ + n, w, h = masks.shape + x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1) + c = np.arange(h, dtype=np.float32)[None, None, :] # rows shape(1,w,1) + r = np.arange(w, dtype=np.float32)[None, :, None] # cols shape(h,1,1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) diff --git a/picamera2/devices/imx500/postprocess_efficientdet_lite0.py b/picamera2/devices/imx500/postprocess_efficientdet_lite0.py new file mode 100644 index 00000000..3ffd7fd4 --- /dev/null +++ b/picamera2/devices/imx500/postprocess_efficientdet_lite0.py @@ -0,0 +1,213 @@ +""" +Efficientdet postprocessing + +This code is based on: +https://github.com/google/automl/tree/master/efficientdet +""" + +from typing import Tuple + +import numpy as np + +from picamera2.devices.imx500.postprocess import ( + BoxFormat, convert_to_ymin_xmin_ymax_xmax_format, nms) +from picamera2.devices.imx500.postprocess_yolov5 import coco80_to_coco91 + +default_box_variance = [1.0, 1.0, 1.0, 1.0] +default_aspect_ratios = [1.0, 2.0, 0.5] + + +def postprocess_efficientdet_lite0_detection(outputs: Tuple[np.ndarray, np.ndarray, np.ndarray], + anchor_scale=3, + min_level=3, + max_level=7, + box_variance=default_box_variance, + model_input_shape=(320, 320), + min_wh=2, + max_wh=7680, + conf_thres: float = 0.001, + iou_thres: float = 0.65, + max_nms_dets: int = 5000, + max_out_dets: int = 1000): + H, W = model_input_shape + ############################################################ + # Box decoding + ############################################################ + outputs_decoded = box_decoding_edetlite(output_annotations=outputs, + H=H, + W=W, + anchor_scale=anchor_scale, + min_level=min_level, + max_level=max_level, + box_variance=box_variance) + + classes = outputs[0] + num_categories = classes.shape[-1] + + ############################################################ + # Post processing for each input image + ############################################################ + # Note: outputs_decoded shape is [Batch,num_anchors*Detections,(4+1+num_categories)] + post_processed_outputs = [] + for _, x in enumerate(outputs_decoded): + # ---------------------------------------- + # Filter by score and width-height + # ---------------------------------------- + scores = x[..., 4] + wh = x[..., 2:4] + valid_indexs = (scores > conf_thres) & ((wh > min_wh).any(1)) & ((wh < max_wh).any(1)) + x = x[valid_indexs] + + # ---------------------------------------- + # Taking Best class only + # ---------------------------------------- + x[..., 5:] *= x[..., 4:5] # compute confidence per class (class_score * object_score) + conf = np.max(x[:, 5:], axis=1, keepdims=True) + classes_id = np.argmax(x[:, 5:], axis=1, keepdims=True) + + # Change boxes format from [x_c,y_c,w,h] to [y_min,x_min,y_max,x_max] + boxes = convert_to_ymin_xmin_ymax_xmax_format(x[..., :4], BoxFormat.XC_YC_W_H) + x = np.concatenate((boxes, conf, classes_id), axis=1)[conf.reshape(-1) > conf_thres] + + # --------------------------- # + # NMS + # --------------------------- # + x = x[np.argsort(-x[:, 
4])[:max_nms_dets]] # sort by confidence from high to low + offset = x[..., 5:6] * np.maximum(H, W) + boxes_offset, scores = x[..., :4] + offset, x[..., 4] # boxes with offset by class + valid_indexs = nms(dets=boxes_offset, scores=scores, iou_thres=iou_thres, max_out_dets=max_out_dets) + x = x[valid_indexs] + + boxes = x[..., :4] + + # --------------------------- # + # Classes process + # --------------------------- # + # convert classes from coco80 to coco91 to match labels + classes = coco80_to_coco91(x[..., 5]) if num_categories == 80 else x[..., 5] + classes -= 0 + + # --------------------------- # + # Scores + # --------------------------- # + scores = x[..., 4] + + # Add result + post_processed_outputs.append({'boxes': boxes, 'classes': classes, 'scores': scores}) + + return post_processed_outputs[0]['boxes'], post_processed_outputs[0]['scores'], post_processed_outputs[0]['classes'] + + +def box_decoding_edetlite(output_annotations, + H=320, + W=320, + anchor_scale=3, + min_level=3, + max_level=7, + box_variance=default_box_variance): + # ----------------------------------------------- + # EfficientDetLite detection post processing + # ----------------------------------------------- + # Note: 'output_annotations' is expected to be a list of 2 feature maps with shapes: + # [0] : [Batch,Detections,num_categories] + # [1] : [Batch,Detections,4] + classes = output_annotations[0] + boxes = output_annotations[1] + classes = 1 / (1 + np.exp(-classes)) # sigmoid + scores = np.ones((*boxes.shape[:-1], 1)) # Add default object scores of 1.0 + + # Combine tensors + outputs = np.concatenate((boxes, scores, classes), axis=2) + + # Box decoding + # Anchor boxes format: [y_min, x_min, y_max, x_max] normalized + + # Extract feature map sizes + strides = [2 ** i for i in range(max_level + 1)] + featmap_sizes = [(np.ceil(H / stride), np.ceil(W / stride)) for stride in strides] + + # Generate priors + batch_size = outputs.shape[0] + anchors = generate_anchors_EDETLITE(batch_size=batch_size, + featmap_sizes=featmap_sizes, + H=H, + W=W, + anchor_scale=anchor_scale, + min_level=min_level, + max_level=max_level) + + # Decode bboxes + y_c_anchors = (anchors[..., 0:1] + anchors[..., 2:3]) / 2 + x_c_anchors = (anchors[..., 1:2] + anchors[..., 3:4]) / 2 + ha = anchors[..., 2:3] - anchors[..., 0:1] + wa = anchors[..., 3:4] - anchors[..., 1:2] + + # Output Box format: [x_c, y_c, w, h] + pred_boxes = outputs[..., :4] + y_c = pred_boxes[..., 0:1] * box_variance[0] * ha + y_c_anchors + x_c = pred_boxes[..., 1:2] * box_variance[1] * wa + x_c_anchors + h = np.exp(pred_boxes[..., 2:3] * box_variance[2]) * ha + w = np.exp(pred_boxes[..., 3:4] * box_variance[3]) * wa + outputs[..., 0:1] = x_c + outputs[..., 1:2] = y_c + outputs[..., 2:3] = w + outputs[..., 3:4] = h + return outputs + + +def generate_anchors_EDETLITE(batch_size, + featmap_sizes, + H=320, + W=320, + anchor_scale=3, + min_level=3, + max_level=7, + aspect_ratios=default_aspect_ratios): + """Generate configurations of anchor boxes.""" + anchor_scales = [anchor_scale] * (max_level - min_level + 1) + num_scales = len(aspect_ratios) + anchor_configs = {} + for level in range(min_level, max_level + 1): + anchor_configs[level] = [] + for scale_octave in range(num_scales): + for aspect in aspect_ratios: + anchor_configs[level].append( + ((featmap_sizes[0][0] / float(featmap_sizes[level][0]), + featmap_sizes[0][1] / float(featmap_sizes[level][1])), + scale_octave / float(num_scales), aspect, + anchor_scales[level - min_level])) + + """Generates multiscale anchor 
boxes.""" + boxes_all = [] + for _, configs in anchor_configs.items(): + boxes_level = [] + for config in configs: + stride, octave_scale, aspect, anchor_scale = config + base_anchor_size_x = anchor_scale * stride[1] * 2 ** octave_scale + base_anchor_size_y = anchor_scale * stride[0] * 2 ** octave_scale + if isinstance(aspect, list): + aspect_x, aspect_y = aspect + else: + aspect_x = np.sqrt(aspect) + aspect_y = 1.0 / aspect_x + anchor_size_x_2 = base_anchor_size_x * aspect_x / 2.0 + anchor_size_y_2 = base_anchor_size_y * aspect_y / 2.0 + + x = np.arange(stride[1] / 2, W, stride[1]) + y = np.arange(stride[0] / 2, H, stride[0]) + xv, yv = np.meshgrid(x, y) + xv = xv.reshape(-1) + yv = yv.reshape(-1) + + boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2, + yv + anchor_size_y_2, xv + anchor_size_x_2)) + boxes = np.swapaxes(boxes, 0, 1) + boxes_level.append(np.expand_dims(boxes, axis=1)) + + # concat anchors on the same level to the shape Batch x Detections x 4 + boxes_level = np.concatenate(boxes_level, axis=1).reshape([1, -1, 4]) + boxes_level = np.repeat(boxes_level, batch_size, axis=0) + boxes_all.append(boxes_level) + + anchor_boxes = np.concatenate(boxes_all, axis=1) + return anchor_boxes diff --git a/picamera2/devices/imx500/postprocess_highernet.py b/picamera2/devices/imx500/postprocess_highernet.py new file mode 100644 index 00000000..6cd97539 --- /dev/null +++ b/picamera2/devices/imx500/postprocess_highernet.py @@ -0,0 +1,562 @@ +""" +Highernet postprocessing + +This code is based on multiple sources: +https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation +https://github.com/princeton-vl/pose-ae-train +https://github.com/yinguobing/facial-landmark-detection-hrnet +""" + +from typing import Tuple + +import cv2 +import numpy as np + +try: + from munkres import Munkres +except ImportError: + raise ImportError("Please install munkres first. 
`pip3 install --break-system-packages munkres`") + +default_joint_order = [0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16] + + +def postprocess_higherhrnet(outputs: list[np.ndarray, np.ndarray], + img_size, + img_w_pad, + img_h_pad, + network_postprocess, + num_joints=17, + tag_per_joint=True, + joint_order=default_joint_order, + detection_threshold=0.3, + max_num_people=30, + nms_kernel=5, + nms_padding=2, + ignore_too_much=False, + use_detection_val=True, + tag_threshold=1.0, + adjust=False, + refine=False, + input_image_size=(288, 384), + output_shape=(144, 192)) -> Tuple[list[list], list, list[list]]: + all_preds = [] + all_scores = [] + if network_postprocess: + # outputs [[B, max_num_people, num_joints], [B, max_num_people, num_joints], [B, max_num_people, num_joints]] + grouped, scores = parse(network_outputs=[outputs[0][0, ...], + outputs[1][0, ...], + outputs[2][0, ...]], + output_shape=output_shape, + adjust=adjust, + refine=refine, + network_postprocess=network_postprocess, + tag_per_joint=tag_per_joint, + max_num_people=max_num_people, + nms_kernel=nms_kernel, + nms_padding=nms_padding, + num_joints=num_joints, + joint_order=joint_order, + detection_threshold=detection_threshold, + ignore_too_much=ignore_too_much, + use_detection_val=use_detection_val, + tag_threshold=tag_threshold) + else: + out0 = outputs[0][0] + out1 = outputs[1][0] + + # postprocess: + # resize first output to 2nd output size + out0 = ResizeBilinear(out0, out1.shape[0], out1.shape[1]) + # average heatmaps from both outputs + heatmaps = (out0[..., :17] + out1) / 2 + tags = out0[..., 17:] + grouped, scores = parse(network_outputs=[heatmaps, tags], + output_shape=output_shape, + adjust=adjust, + refine=refine, + network_postprocess=network_postprocess, + tag_per_joint=tag_per_joint, + max_num_people=max_num_people, + nms_kernel=nms_kernel, + nms_padding=nms_padding, + num_joints=num_joints, + joint_order=joint_order, + detection_threshold=detection_threshold, + ignore_too_much=ignore_too_much, + use_detection_val=use_detection_val, + tag_threshold=tag_threshold) + + # scale keypoints coordinates to input image size + scale_factor = (np.array(input_image_size) / output_shape).reshape((1, 1, 2)) + for img_index in range(len(grouped)): + if grouped[img_index].shape[0] > 0: + # rescale to preprocessed input image size + grouped[img_index][:, :, :2] = grouped[img_index][:, :, :2] * scale_factor + # remove pad offset: + grouped[img_index][:, :, 0] = grouped[img_index][:, :, 0] - img_w_pad[0] + grouped[img_index][:, :, 1] = grouped[img_index][:, :, 1] - img_h_pad[0] + # rescale to original image size + resized_input_image = np.array(input_image_size) - np.array( + (sum(img_h_pad), + sum(img_w_pad))) + s = (np.array(img_size) / resized_input_image).reshape((1, 1, 2)) + grouped[img_index][:, :, :2] = grouped[img_index][:, :, :2] * s + + # Calculate zero keypoint + zero_kpt = np.zeros((1, 4)) + resized_input_image = np.array(input_image_size) - np.array( + (sum(img_h_pad), + sum(img_w_pad))) + s = (np.array(img_size) / resized_input_image).reshape((1, 1, 2)) + zero_kpt[:, 0] = zero_kpt[:, 0] - img_w_pad[0] + zero_kpt[:, 1] = zero_kpt[:, 1] - img_h_pad[0] + zero_kpt[:, :2] = zero_kpt[:, :2] * s + + all_preds.append(grouped) + all_scores.append(scores) + + kpts = [] + # one image, one iter + for idx, _kpts in enumerate(all_preds): + for idx_kpt, kpt in enumerate(_kpts[0]): + area = (np.max(kpt[:, 0]) - np.min(kpt[:, 0])) * (np.max(kpt[:, 1]) - np.min(kpt[:, 1])) + # kpt [17, 4] + kpt = processKeypoints(kpt) + 
kpts.append( + { + 'keypoints': kpt[:, 0:3], + 'score': all_scores[idx][idx_kpt], + 'tags': kpt[:, 3], + 'area': area + } + ) + # _coco_keypoint_results_one_category_kernel + out_keypoints = [] + out_scores = [] + out_bbox = [] + + # for img_kpts in kpts: + img_kpts = kpts + if len(img_kpts) == 0: + return [], [], [] + + _key_points = np.array( + [img_kpts[k]['keypoints'] for k in range(len(img_kpts))] + ) + key_points = np.zeros( + (_key_points.shape[0], num_joints * 3), + dtype=np.float32 + ) + + for ipt in range(num_joints): + key_points[:, ipt * 3 + 0] = _key_points[:, ipt, 0] + key_points[:, ipt * 3 + 1] = _key_points[:, ipt, 1] + key_points[:, ipt * 3 + 2] = _key_points[:, ipt, 2] # keypoints score. + + for k in range(len(img_kpts)): + kpt = key_points[k].reshape((num_joints, 3)) + # ignore zero kpts + mask = np.isin(kpt, zero_kpt) + kpt = np.where(mask, np.nan, kpt) + left_top = np.nanmin(kpt, axis=0) + right_bottom = np.nanmax(kpt, axis=0) + + out_keypoints.append(list(key_points[k])) + out_scores.append(img_kpts[k]['score']) + out_bbox.append([left_top[1], left_top[0], right_bottom[1], right_bottom[0]]) + return out_keypoints, out_scores, out_bbox + + +def parse(network_outputs, + output_shape, + adjust=False, + refine=False, + network_postprocess=False, + tag_per_joint=17, + max_num_people=30, + nms_kernel=5, + nms_padding=2, + num_joints=17, + joint_order=default_joint_order, + detection_threshold=0.1, + ignore_too_much=False, + use_detection_val=True, + tag_threshold=1.0 + ): + if network_postprocess: + tag_k, ind_k, val_k = network_outputs + x = ind_k % output_shape[1] + y = (ind_k / output_shape[1]).astype(ind_k.dtype) + ind_k = np.stack([x, y], axis=2) + + topk_output_dict = {'tag_k': tag_k[np.newaxis, ...], + 'loc_k': ind_k[np.newaxis, ...], + 'val_k': val_k[np.newaxis, ...], + } + else: + det, tag = network_outputs + # topk_output_dict + # {'tag_k': [num_images, max_num_people, num_joints], + # 'loc_k': [num_images, max_num_people, num_joints, 2], + # 'val_k': [num_images, max_num_people, num_joints]} + topk_output_dict = top_k(det=det, + tag=tag, + tag_per_joint=tag_per_joint, + max_num_people=max_num_people, + nms_kernel=nms_kernel, + nms_padding=nms_padding) + # ans [num_joints_detected, num_joints, 4] + ans = match(tag_k=topk_output_dict['tag_k'], + loc_k=topk_output_dict['loc_k'], + val_k=topk_output_dict['val_k'], + num_joints=num_joints, + joint_order=joint_order, + detection_threshold=detection_threshold, + max_num_people=max_num_people, + ignore_too_much=ignore_too_much, + use_detection_val=use_detection_val, + tag_threshold=tag_threshold) + if adjust: + # ans [[num_joints_detected, num_joints, 4]] + ans = adjust_func(ans, det[np.newaxis, ...]) # TODO support batch size > 1 + + scores = [i[:, 2].mean() for i in ans[0]] + + if refine: + ans = ans[0] + # for every detected person + for _ in range(len(ans)): + # NotImplemented + if not tag_per_joint: + raise NotImplementedError + + # ans [[num_joints_detected, num_joints, 4]] + ans = [ans] + return ans, scores + + +def ResizeBilinear(img, new_height, new_width): + return cv2.resize(img, (new_width, new_height)) + + +def top_k(det, + tag, + tag_per_joint=17, + max_num_people=30, + nms_kernel=5, + nms_padding=2): + # det [144, 192, 17] + # tag [144, 192, 17] + + # det [144, 192, 17] + det = nms(det, + nms_kernel=nms_kernel, + nms_padding=nms_padding) + # num_images 1 + # h 144 + # w 192 + # num_joints 17 + num_images, h, w, num_joints = (1,) + det.shape # TODO: support multiple images (batch>1) + + # det [num_images, 
h*w, num_joints] + det = det.reshape((num_images, -1, num_joints)) + # val_k [num_images, max_num_people, num_joints] + val_k, ind = np_topk(det, max_num_people) + + # tag [num_images, h*w, num_joints] + tag = tag.reshape((num_images, -1, num_joints)) + + # NotImplemented + if not tag_per_joint: + raise NotImplementedError + tag = tag.expand(-1, num_joints, -1, -1) + + # tag_k [num_images, max_num_people, num_joints] + tag_k = np.zeros((num_images, max_num_people, num_joints)) + for img in range(num_images): + for kp in range(num_joints): + tag_k[img, :, kp] = tag[img, ind[img, :, kp], kp] + + x = ind % w + y = (ind / w).astype(ind.dtype) + + # ind_k [num_images, max_num_people, num_joints, 2] + ind_k = np.stack([x, y], axis=3) + + # {'tag_k': [num_images, max_num_people, num_joints], + # 'loc_k': [num_images, max_num_people, num_joints, 2], + # 'val_k': [num_images, max_num_people, num_joints]} + return {'tag_k': tag_k, + 'loc_k': ind_k, + 'val_k': val_k, + } + + +def nms(det, + nms_kernel=5, + nms_padding=2): + # det [144, 192, 17] + # maxm [144, 192, 17] + maxm = np_max_pool(det, k=nms_kernel, p=nms_padding) + maxm = np.equal(maxm, det).astype(np.float32) + det = det * maxm + return det + + +def np_max_pool(x, + k=5, + p=2, + p_value=0): + # x [144, 192, 17] + # k - kernel size (h, w) + # p - padding size (top, bottom, left, right) + if isinstance(k, int): + k = (k, k) + if isinstance(p, int): + p = ((p, p), (p, p), (0, 0)) + elif isinstance(p, (list, tuple)) and len(p) == 2: + p = ((p[0], p[0]), (p[1], p[1]), (0, 0)) + + # y [148, 196, 17 + y = np.pad(x, p) + out = np.concatenate( + [np.max(np.concatenate([y[ky:ky + y.shape[0] - k[0] + 1, kx:kx + y.shape[1] - k[1] + 1, c:c + 1] + for ky in range(k[0]) + for kx in range(k[1])], 2), axis=2, keepdims=True) for c in range(y.shape[2])], 2) + # out [144, 192, 17] + return out + + +def np_topk(x, k): + # x [1, 27648, 17] + # n_images 1 + # n_keypoints 17 + n_images, _, n_keypoints = x.shape + # vals [1, k, 17] + # inds [1, k, 17] + vals = np.zeros((n_images, k, n_keypoints), dtype=x.dtype) + inds = np.zeros((n_images, k, n_keypoints), dtype=np.int64) + for img in range(n_images): + for kp in range(n_keypoints): + # _inds [k] + _inds = np.argpartition(x[img, :, kp], -k)[-k:] + _inds = _inds[np.argsort(x[img, _inds, kp], )][::-1] + inds[img, :, kp] = _inds + vals[img, :, kp] = x[img, _inds, kp] + return vals, inds + + +def match(tag_k, + loc_k, + val_k, + num_joints=17, + joint_order=default_joint_order, + detection_threshold=0.1, + max_num_people=30, + ignore_too_much=False, + use_detection_val=True, + tag_threshold=1.0): + def m(x): + return match_by_tag(inp=x, + num_joints=num_joints, + joint_order=joint_order, + detection_threshold=detection_threshold, + max_num_people=max_num_people, + ignore_too_much=ignore_too_much, + use_detection_val=use_detection_val, + tag_threshold=tag_threshold) + return list(map(m, zip(tag_k, loc_k, val_k))) + + +def match_by_tag(inp, + num_joints=17, + joint_order=default_joint_order, + detection_threshold=0.1, + max_num_people=30, + ignore_too_much=False, + use_detection_val=True, + tag_threshold=1.0): + # tag_k [num_images, max_num_people, num_joints] + # loc_k [num_images, max_num_people, num_joints, 2] + # val_k [num_images, max_num_people, num_joints] + tag_k, loc_k, val_k = inp + # default_ [num_joints, 4] + default_ = np.zeros((num_joints, 3 + 1)) # tag_k.shape[2] assumed to be 1 # pytorch shape: (17, 4) + + joint_dict = {} + tag_dict = {} + for i in range(num_joints): + idx = joint_order[i] + + # tags 
[max_num_people, 1] + tags = tag_k[:, idx:idx + 1] + # joints [max_num_people, 4] + joints = np.concatenate((loc_k[:, idx, :], val_k[:, idx:idx + 1], tags), 1) + # mask [max_num_people] + mask = joints[:, 2] > detection_threshold + tags = tags[mask] + joints = joints[mask] + + if joints.shape[0] == 0: + continue + + if i == 0 or len(joint_dict) == 0: + for tag, joint in zip(tags, joints): + key = tag[0] + joint_dict.setdefault(key, np.copy(default_))[idx] = joint + tag_dict[key] = [tag] + else: + grouped_keys = list(joint_dict.keys())[:max_num_people] + grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys] + + if ignore_too_much \ + and len(grouped_keys) == max_num_people: + continue + + diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :] + diff_normed = np.linalg.norm(diff, ord=2, axis=2) + diff_saved = np.copy(diff_normed) + + if use_detection_val: + diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] + + num_added = diff.shape[0] + num_grouped = diff.shape[1] + + if num_added > num_grouped: + diff_normed = np.concatenate( + ( + diff_normed, + np.zeros((num_added, num_added - num_grouped)) + 1e10 + ), + axis=1 + ) + + pairs = py_max_match(diff_normed) + for row, col in pairs: + if ( + row < num_added + and col < num_grouped + and diff_saved[row][col] < tag_threshold + ): + key = grouped_keys[col] + joint_dict[key][idx] = joints[row] + tag_dict[key].append(tags[row]) + else: + key = tags[row][0] + joint_dict.setdefault(key, np.copy(default_))[idx] = \ + joints[row] + tag_dict[key] = [tags[row]] + + # ans [len(joint_dict), num_joints, 4] + ans = np.array([joint_dict[i] for i in joint_dict]).astype(np.float32) + return ans + + +def py_max_match(scores): + m = Munkres() + tmp = m.compute(scores) + tmp = np.array(tmp).astype(np.int32) + return tmp + + +def adjust_func(ans, det): + # ans [[num_joints_detected, num_joints, 4]] + # det [144, 192, 17] + for batch_id, people in enumerate(ans): + for people_id, i in enumerate(people): + for joint_id, joint in enumerate(i): + if joint[2] > 0: + y, x = joint[0:2] + xx, yy = int(x), int(y) + # print(batch_id, joint_id, det[batch_id].shape) + tmp = det[batch_id][..., joint_id] + if tmp[xx, min(yy + 1, tmp.shape[1] - 1)] > tmp[xx, max(yy - 1, 0)]: + y += 0.25 + else: + y -= 0.25 + + if tmp[min(xx + 1, tmp.shape[0] - 1), yy] > tmp[max(0, xx - 1), yy]: + x += 0.25 + else: + x -= 0.25 + ans[batch_id][people_id, joint_id, 0:2] = (y + 0.5, x + 0.5) + # ans [[num_joints_detected, num_joints, 4]] + return ans + + +def refine_func(det, tag, keypoints): + # det [144, 192, 17] + # tag [144, 192, 17] + # keypoints [num_joints, 4] + if len(tag.shape) == 3: + # tag shape: (17, 128, 128, 1) + # tag [144, 192, 17, 1] + tag = tag[:, :, :, None] + + tags = [] + for i in range(keypoints.shape[0]): + if keypoints[i, 2] > 0: + # save tag value of detected keypoint + x, y = keypoints[i][:2].astype(np.int32) + tags.append(tag[y, x, i]) + + # mean tag of current detected people + prev_tag = np.mean(tags, axis=0) + ans = [] + + for i in range(keypoints.shape[0]): + # score of joints i at all position + tmp = det[:, :, i] + # distance of all tag values with mean tag of current detected people + tt = (((tag[:, :, i] - prev_tag[None, None, :]) ** 2).sum(axis=2) ** 0.5) + tmp2 = tmp - np.round(tt) + + # find maximum position + y, x = np.unravel_index(np.argmax(tmp2), tmp.shape) + xx = x + yy = y + # detection score at maximum position + val = tmp[y, x] + # offset by 0.5 + x += 0.5 + y += 0.5 + + # add a quarter offset + if tmp[yy, min(xx + 1, 
tmp.shape[1] - 1)] > tmp[yy, max(xx - 1, 0)]: + x += 0.25 + else: + x -= 0.25 + + if tmp[min(yy + 1, tmp.shape[0] - 1), xx] > tmp[max(0, yy - 1), xx]: + y += 0.25 + else: + y -= 0.25 + + ans.append((x, y, val)) + ans = np.array(ans) + + if ans is not None: + for i in range(det.shape[2]): + # add keypoint if it is not detected + if ans[i, 2] > 0 and keypoints[i, 2] == 0: + # if ans[i, 2] > 0.01 and keypoints[i, 2] == 0: + keypoints[i, :2] = ans[i, :2] + keypoints[i, 2] = ans[i, 2] + # keypoints [num_joints_detected, num_joints, 4] + return keypoints + + +def processKeypoints(keypoints): + # keypoints [17, 4] + tmp = keypoints.copy() + if keypoints[:, 2].max() > 0: + num_keypoints = keypoints.shape[0] + for i in range(num_keypoints): + tmp[i][0:3] = [ + float(keypoints[i][0]), + float(keypoints[i][1]), + float(keypoints[i][2]) + ] + + return tmp diff --git a/picamera2/devices/imx500/postprocess_nanodet.py b/picamera2/devices/imx500/postprocess_nanodet.py new file mode 100644 index 00000000..b7f1d0b8 --- /dev/null +++ b/picamera2/devices/imx500/postprocess_nanodet.py @@ -0,0 +1,63 @@ +""" +Nanodet postprocessing + +This code is based on: +https://github.com/RangiLyu/nanodet +""" + +from typing import Tuple + +import numpy as np + +from picamera2.devices.imx500.postprocess import combined_nms, softmax + + +def postprocess_nanodet_detection(outputs, + conf: float = 0.0, + iou_thres: float = 0.65, + max_out_dets: int = 300) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + reg_max = 7 + num_categories = 80 + classes = outputs[..., :num_categories] + boxes = outputs[..., num_categories:] + classes = 1 / (1 + np.exp(-classes)) # sigmoid + + # Extract feature map sizes + strides = [8, 16, 32, 64] + featmap_sizes = [(np.ceil(416 / stride), np.ceil(416 / stride)) for stride in strides] + + # Generate priors + anchors = generate_anchors_NANODET(featmap_sizes, strides) + + # Decode bboxes + batch = boxes.shape[0] + x = np.reshape(boxes, newshape=(batch, -1, 4, reg_max + 1)) + x = softmax(x) + x = np.matmul(x, np.arange(0, reg_max + 1, 1, dtype=np.float32)) + x = np.reshape(x, newshape=(batch, -1, 4)) + distances = x * anchors[..., 2, None] + + # Output Box format: [x_c, y_c, w, h] + w = distances[..., 0:1] + distances[..., 2:3] + h = distances[..., 1:2] + distances[..., 3:4] + x_c = anchors[..., 0:1] - distances[..., 0:1] + w / 2 + y_c = anchors[..., 1:2] - distances[..., 1:2] + h / 2 + boxes = np.concatenate([x_c, y_c, w, h], axis=2) + + return combined_nms(boxes, classes, iou_thres, conf, max_out_dets) + + +def generate_anchors_NANODET(featmap_sizes, strides): + anchors_list = [] + for i, stride in enumerate(strides): + h, w = featmap_sizes[i] + x_range = np.arange(w) * stride + y_range = np.arange(h) * stride + y, x = np.meshgrid(y_range, x_range) + y = y.flatten() + x = x.flatten() + strides = np.ones_like(x) * stride + anchors = np.stack([y, x, strides, strides], axis=-1) + anchors = np.expand_dims(anchors, axis=0) + anchors_list.append(anchors) + return np.concatenate(anchors_list, axis=1) diff --git a/picamera2/devices/imx500/postprocess_yolov5.py b/picamera2/devices/imx500/postprocess_yolov5.py new file mode 100644 index 00000000..d9c5600f --- /dev/null +++ b/picamera2/devices/imx500/postprocess_yolov5.py @@ -0,0 +1,244 @@ +""" +Yolov5 postprocessing + +This code is based on: +https://github.com/ultralytics/ultralytics +""" +from typing import List + +import cv2 +import numpy as np + +from picamera2.devices.imx500.postprocess import ( + BoxFormat, convert_to_ymin_xmin_ymax_xmax_format, nms) + 
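+# Illustrative usage only (not part of the original module): given the list of detection-head
+# tensors returned by IMX500.get_outputs() for a 640x640 YOLOv5n network, a typical call might
+# look like:
+#     boxes, scores, classes = postprocess_yolov5_detection(outputs, model_input_shape=(640, 640),
+#                                                           conf_thres=0.3, iou_thres=0.65)
+#     boxes = scale_boxes(boxes, frame_h, frame_w, 640, 640, preserve_aspect_ratio=True)
+# where `frame_h`/`frame_w` (assumed names) are the dimensions of the frame being annotated.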
+default_anchors = [[10, 13, 16, 30, 33, 23], + [30, 61, 62, 45, 59, 119], + [116, 90, 156, 198, 373, 326]] +default_strides = [8, 16, 32] + + +def postprocess_yolov5_detection(outputs: List[np.ndarray], + model_input_shape=(640, 640), + num_categories=80, + min_wh=2, + max_wh=7680, + conf_thres: float = 0.001, + iou_thres: float = 0.65, + max_nms_dets: int = 5000, + max_out_dets: int = 1000): + H, W = model_input_shape + ############################################################ + # Box decoding + ############################################################ + outputs_decoded = box_decoding_yolov5n(tensors=outputs, num_categories=num_categories, H=H, W=W) + + ############################################################ + # Post processing for each input image + ############################################################ + # Note: outputs_decoded shape is [Batch,num_anchors*Detections,(4+1+num_categories)] + post_processed_outputs = [] + for _, x in enumerate(outputs_decoded): + # ---------------------------------------- + # Filter by score and width-height + # ---------------------------------------- + scores = x[..., 4] + wh = x[..., 2:4] + valid_indexs = (scores > conf_thres) & ((wh > min_wh).any(1)) & ((wh < max_wh).any(1)) + x = x[valid_indexs] + + # ---------------------------------------- + # Taking Best class only + # ---------------------------------------- + x[..., 5:] *= x[..., 4:5] # compute confidence per class (class_score * object_score) + conf = np.max(x[:, 5:], axis=1, keepdims=True) + classes_id = np.argmax(x[:, 5:], axis=1, keepdims=True) + + # Change boxes format from [x_c,y_c,w,h] to [y_min,x_min,y_max,x_max] + boxes = convert_to_ymin_xmin_ymax_xmax_format(x[..., :4], BoxFormat.XC_YC_W_H) + x = np.concatenate((boxes, conf, classes_id), axis=1)[conf.reshape(-1) > conf_thres] + + # --------------------------- # + # NMS + # --------------------------- # + x = x[np.argsort(-x[:, 4])[:max_nms_dets]] # sort by confidence from high to low + offset = x[..., 5:6] * np.maximum(H, W) + boxes_offset, scores = x[..., :4] + offset, x[..., 4] # boxes with offset by class + valid_indexs = nms(dets=boxes_offset, scores=scores, iou_thres=iou_thres, max_out_dets=max_out_dets) + x = x[valid_indexs] + + boxes = x[..., :4] + # --------------------------- # + # Classes process + # --------------------------- # + # convert classes from coco80 to coco91 to match labels + classes = coco80_to_coco91(x[..., 5]) if num_categories == 80 else x[..., 5] + classes -= 1 + + # --------------------------- # + # Scores + # --------------------------- # + scores = x[..., 4] + + # Add result + post_processed_outputs.append({'boxes': boxes, 'classes': classes, 'scores': scores}) + + return post_processed_outputs[0]['boxes'], post_processed_outputs[0]['scores'], post_processed_outputs[0]['classes'] + + +def box_decoding_yolov5n(tensors, + num_categories=80, + H=640, + W=640, + anchors=default_anchors, + strides=default_strides): + # Tensors box format: [x_c, y_c, w, h] + no = num_categories + 5 # number of outputs per anchor + nl = len(anchors) # number of detection layers + na = len(anchors[0]) // 2 # number of anchors + anchor_grid = np.reshape(np.array(anchors), [nl, 1, -1, 1, 2]) + anchor_grid = anchor_grid.astype(np.float32) + z = [] + for i in range(nl): + ny, nx = H // strides[i], W // strides[i] + xv, yv = np.meshgrid(np.arange(nx), np.arange(ny)) + grid = np.reshape(np.stack([xv, yv], 2), [1, 1, ny * nx, 2]).astype(np.float32) + + y = tensors[i] + y = np.transpose(y, [0, 2, 1, 3]) + xy = (y[..., 
0:2] * 2 - 0.5 + grid) * strides[i] # xy + wh = (y[..., 2:4] * 2) ** 2 * anchor_grid[i] + + # Output box format: [x_c, y_c, w, h] + y = np.concatenate([xy, wh, y[..., 4:]], -1) + z.append(np.reshape(y, [-1, na * ny * nx, no])) + + return np.concatenate(z, 1) + + +# same as in preprocess but differs in h/w location +def scale_boxes(boxes: np.ndarray, h_image: int, w_image: int, h_model: int, w_model: int, + preserve_aspect_ratio: bool) -> np.ndarray: + """ + Scale and offset bounding boxes based on model output size and original image size. + + Args: + boxes (numpy.ndarray): Array of bounding boxes in format [y_min, x_min, y_max, x_max]. + h_image (int): Original image height. + w_image (int): Original image width. + h_model (int): Model output height. + w_model (int): Model output width. + preserve_aspect_ratio (bool): Whether to preserve image aspect ratio during scaling + + Returns: + numpy.ndarray: Scaled and offset bounding boxes. + """ + deltaH, deltaW = 0, 0 + H, W = h_model, w_model + scale_H, scale_W = h_image / H, w_image / W + + if preserve_aspect_ratio: + scale_H = scale_W = max(h_image / H, w_image / W) + H_tag = int(np.round(h_image / scale_H)) + W_tag = int(np.round(w_image / scale_W)) + deltaH, deltaW = int((H - H_tag) / 2), int((W - W_tag) / 2) + + # Scale and offset boxes + boxes[..., 0] = (boxes[..., 0] - deltaH) * scale_H + boxes[..., 1] = (boxes[..., 1] - deltaW) * scale_W + boxes[..., 2] = (boxes[..., 2] - deltaH) * scale_H + boxes[..., 3] = (boxes[..., 3] - deltaW) * scale_W + + # Clip boxes + boxes = clip_boxes(boxes, h_image, w_image) + + return boxes + + +# same as in preprocess but differs in h/w location +def clip_boxes(boxes: np.ndarray, h: int, w: int) -> np.ndarray: + """ + Clip bounding boxes to stay within the image boundaries. + + Args: + boxes (numpy.ndarray): Array of bounding boxes in format [y_min, x_min, y_max, x_max]. + h (int): Height of the image. + w (int): Width of the image. + + Returns: + numpy.ndarray: Clipped bounding boxes. 
+ """ + boxes[..., 0] = np.clip(boxes[..., 0], a_min=0, a_max=h) + boxes[..., 1] = np.clip(boxes[..., 1], a_min=0, a_max=w) + boxes[..., 2] = np.clip(boxes[..., 2], a_min=0, a_max=h) + boxes[..., 3] = np.clip(boxes[..., 3], a_min=0, a_max=w) + return boxes + + +def _normalize_coordinates(boxes, orig_width, orig_height, boxes_format): + """ + Gets boxes in the original images values and normalize them to be between 0 to 1 + + :param boxes: + :param orig_width: original image width + :param orig_height: original image height + :param boxes_format: if the boxes are in XMIN_YMIN_W_H or YMIM_XMIN_YMAX_XMAX format + :return: + """ + if len(boxes) == 0: + return boxes + elif _are_boxes_normalized(boxes): + return boxes + boxes[:, 0] = np.divide(boxes[:, 0], orig_height) + boxes[:, 1] = np.divide(boxes[:, 1], orig_width) + boxes[:, 2] = np.divide(boxes[:, 2], orig_height) + boxes[:, 3] = np.divide(boxes[:, 3], orig_width) + return boxes + + +def _are_boxes_normalized(boxes): + if len(boxes) == 0: + return True # it doesn't matter + if max(boxes[0]) > 1: + return False + return True + + +def apply_normalization(boxes, orig_width, orig_height, boxes_format): + if _are_boxes_normalized(boxes): + return boxes + return _normalize_coordinates(boxes, orig_width, orig_height, boxes_format) + + +# Locate at tutorials +def coco80_to_coco91(x): # converts 80-index to 91-index + coco91Indexs = np.array( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]) + + return coco91Indexs[x.astype(np.int32)] + + +def yolov5n_preprocess(img): + # AspectPreservingResizeWithPad + new_height = 640 + new_width = 640 + pad_value = 114 + resize_method = 3 # area + resize_ratio = max(img.shape[0] / new_height, img.shape[1] / new_width) + height_tag = int(np.round(img.shape[0] / resize_ratio)) + width_tag = int(np.round(img.shape[1] / resize_ratio)) + pad_values = ((int((new_height - height_tag) / 2), int((new_height - height_tag) / 2 + 0.5)), + (int((new_width - width_tag) / 2), int((new_width - width_tag) / 2 + 0.5)), + (0, 0)) + + resized_img = cv2.resize(img, (width_tag, height_tag), interpolation=resize_method) + padded_img = np.pad(resized_img, pad_values, constant_values=pad_value) + + # Normalize + mean = 0 + std = 255 + normalized_img = (padded_img - mean) / std + + return normalized_img diff --git a/picamera2/devices/imx500/postprocess_yolov8.py b/picamera2/devices/imx500/postprocess_yolov8.py new file mode 100644 index 00000000..91a1d63a --- /dev/null +++ b/picamera2/devices/imx500/postprocess_yolov8.py @@ -0,0 +1,176 @@ +""" +Yolov5 postprocessing + +This code is based on: +https://github.com/ultralytics/ultralytics +""" +from typing import Tuple + +import cv2 +import numpy as np + +from picamera2.devices.imx500.postprocess import ( + BoxFormat, combined_nms, combined_nms_seg, + convert_to_ymin_xmin_ymax_xmax_format, crop_mask, nms) + + +def postprocess_yolov8_detection(outputs: Tuple[np.ndarray, np.ndarray], + conf: float = 0.3, + iou_thres: float = 0.7, + max_out_dets: int = 50) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Postprocess the outputs of a YOLOv8 model for object detection + + Args: + outputs (Tuple[np.ndarray, np.ndarray]): Tuple containing the model outputs for bounding boxes and class predictions. 
+ conf (float, optional): Confidence threshold for bounding box predictions. Default is 0.3 + iou_thres (float, optional): IoU (Intersection over Union) threshold for Non-Maximum Suppression (NMS). Default is 0.7. + max_out_dets (int, optional): Maximum number of output detections to keep after NMS. Default is 50. + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple containing the post-processed bounding boxes, + their corresponding scores, and categories. + """ + feat_sizes = np.array([80, 40, 20]) + stride_sizes = np.array([8, 16, 32]) + a, s = (x.transpose() for x in make_anchors_yolo_v8(feat_sizes, stride_sizes, 0.5)) + + y_bb, y_cls = outputs + dbox = dist2bbox_yolo_v8(y_bb, a, xywh=True, dim=1) * s + detect_out = np.concatenate((dbox, y_cls), 1) + + xd = detect_out.transpose([0, 2, 1]) + + return combined_nms(xd[..., :4], xd[..., 4:84], iou_thres, conf, max_out_dets) + + +def postprocess_yolov8_keypoints(outputs: Tuple[np.ndarray, np.ndarray, np.ndarray], + conf: float = 0.3, + iou_thres: float = 0.7, + max_out_dets: int = 300) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Postprocess the outputs of a YOLOv8 model for object detection and pose estimation. + + Args: + outputs (Tuple[np.ndarray, np.ndarray, np.ndarray]): Tuple containing the model outputs for bounding boxes, + class predictions, and keypoint predictions. + conf (float, optional): Confidence threshold for bounding box predictions. Default is 0.3 + iou_thres (float, optional): IoU (Intersection over Union) threshold for Non-Maximum Suppression (NMS). Default is 0.7. + max_out_dets (int, optional): Maximum number of output detections to keep after NMS. Default is 300. + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple containing the post-processed bounding boxes, their + corresponding scores, and keypoints. 
+ + """ + kpt_shape = (17, 3) + feat_sizes = np.array([80, 40, 20]) + stride_sizes = np.array([8, 16, 32]) + a, s = (x.transpose() for x in make_anchors_yolo_v8(feat_sizes, stride_sizes, 0.5)) + + y_bb, y_cls, kpts = outputs + dbox = dist2bbox_yolo_v8(y_bb, a, xywh=True, dim=1) * s + detect_out = np.concatenate((dbox, y_cls), 1) + # additional part for pose estimation + ndim = kpt_shape[1] + pred_kpt = kpts.copy() + if ndim == 3: + pred_kpt[:, 2::3] = 1 / (1 + np.exp(-pred_kpt[:, 2::3])) # sigmoid (WARNING: inplace .sigmoid_() Apple MPS bug) + pred_kpt[:, 0::ndim] = (pred_kpt[:, 0::ndim] * 2.0 + (a[0] - 0.5)) * s + pred_kpt[:, 1::ndim] = (pred_kpt[:, 1::ndim] * 2.0 + (a[1] - 0.5)) * s + + x = np.concatenate([detect_out.transpose([2, 1, 0]).squeeze(), pred_kpt.transpose([2, 1, 0]).squeeze()], 1) + x = x[(x[:, 4] > conf)] + x = x[np.argsort(-x[:, 4])[:8400]] + x[..., :4] = convert_to_ymin_xmin_ymax_xmax_format(x[..., :4], BoxFormat.XC_YC_W_H) + boxes = x[..., :4] + scores = x[..., 4] + + # Original post-processing part + valid_indexs = nms(boxes, scores, iou_thres=iou_thres, max_out_dets=max_out_dets) + x = x[valid_indexs] + nms_bbox = x[:, :4] + nms_scores = x[:, 4] + nms_kpts = x[:, 5:] + + return nms_bbox, nms_scores, nms_kpts + + +def postprocess_yolov8_inst_seg(outputs: Tuple[np.ndarray, np.ndarray, np.ndarray], + conf: float = 0.001, + iou_thres: float = 0.7, + max_out_dets: int = 300) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + feat_sizes = np.array([80, 40, 20]) + stride_sizes = np.array([8, 16, 32]) + a, s = (x.transpose() for x in make_anchors_yolo_v8(feat_sizes, stride_sizes, 0.5)) + + y_bb, y_cls, ymask_weights, y_masks = outputs + dbox = dist2bbox_yolo_v8(y_bb, a, xywh=True, dim=1) * s + detect_out = np.concatenate((dbox, y_cls), 1) + + xd = detect_out.transpose([0, 2, 1]) + nms_bbox, nms_scores, nms_classes, ymask_weights = combined_nms_seg(xd[..., :4], xd[..., 4:84], + ymask_weights, iou_thres, conf, max_out_dets)[0] + if len(nms_scores) == 0: + final_masks = y_masks + else: + y_masks = y_masks.squeeze(0) + ymask_weights = ymask_weights.transpose(1, 0) + final_masks = np.tensordot(ymask_weights, y_masks, axes=([0], [0])) + + return nms_bbox, nms_scores, nms_classes, final_masks + + +def make_anchors_yolo_v8(feats, strides, grid_cell_offset=0.5): + """Generate anchors from features.""" + anchor_points, stride_tensor = [], [] + assert feats is not None + for i, stride in enumerate(strides): + h, w = feats[i], feats[i] + sx = np.arange(stop=w) + grid_cell_offset # shift x + sy = np.arange(stop=h) + grid_cell_offset # shift y + sy, sx = np.meshgrid(sy, sx, indexing='ij') + anchor_points.append(np.stack((sx, sy), -1).reshape((-1, 2))) + stride_tensor.append(np.full((h * w, 1), stride)) + return np.concatenate(anchor_points), np.concatenate(stride_tensor) + + +def dist2bbox_yolo_v8(distance, anchor_points, xywh=True, dim=-1): + """Transform distance(ltrb) to box(xywh or xyxy).""" + lt, rb = np.split(distance, 2, axis=dim) + x1y1 = anchor_points - lt + x2y2 = anchor_points + rb + if xywh: + c_xy = (x1y1 + x2y2) / 2 + wh = x2y2 - x1y1 + return np.concatenate((c_xy, wh), dim) # xywh bbox + return np.concatenate((x1y1, x2y2), dim) # xyxy bbox + + +def pad_with_zeros(mask, roi, isp_output_size): + new_shape = (isp_output_size.width, isp_output_size.height, mask.shape[2]) + padded_mask = np.zeros(new_shape, dtype=mask.dtype) + padded_mask[roi.x:roi.x + mask.shape[0], roi.y:roi.y + mask.shape[1], :] = mask + return padded_mask + + +def process_masks(masks, boxes, roi, isp_output_size): 
+ # Crop masks based on bounding boxes + masks = crop_mask(masks, boxes) + + # Apply sigmoid function to normalize masks + masks = 1 / (1 + np.exp(-masks)) + masks = np.transpose(masks, (2, 1, 0)) # Change to HWC format + + # Resize masks to model input size + masks = cv2.resize(masks, (roi.height, roi.width), interpolation=cv2.INTER_LINEAR) + + # Ensure masks are in the correct shape + masks = np.expand_dims(masks, -1) if len(masks.shape) == 2 else masks + + masks = pad_with_zeros(masks, roi, isp_output_size) + + # Ensure masks are in the correct shape + masks = np.expand_dims(masks, -1) if len(masks.shape) == 2 else masks + masks = np.transpose(masks, (2, 1, 0)) # Change back to CHW format + return masks diff --git a/picamera2/devices/imx708/__init__.py b/picamera2/devices/imx708/__init__.py new file mode 100644 index 00000000..6157cbb4 --- /dev/null +++ b/picamera2/devices/imx708/__init__.py @@ -0,0 +1 @@ +from .imx708 import IMX708 diff --git a/picamera2/devices/imx708/imx708.py b/picamera2/devices/imx708/imx708.py new file mode 100644 index 00000000..281d0b54 --- /dev/null +++ b/picamera2/devices/imx708/imx708.py @@ -0,0 +1,67 @@ +import fcntl +import os + +from v4l2 import VIDIOC_S_CTRL, v4l2_control + +from picamera2 import Picamera2 + +HDR_CTRL_ID = 0x009a0915 + + +class IMX708: + def __init__(self, camera_num=None): + self.device_fd = None + + camera_info = Picamera2.global_camera_info() + if camera_num is None: + camera_id = next((c['Id'] for c in camera_info if c['Model'] == 'imx708'), None) + else: + camera_id = next((c['Id'] for c in camera_info if c['Num'] == camera_num), None) + + if camera_id is None: + raise RuntimeError('IMX708: Requested IMX708 camera device not be found') + + for i in range(16): + test_dir = f'/sys/class/video4linux/v4l-subdev{i}/device' + module_dir = f'{test_dir}/driver/module' + id_dir = f'{test_dir}/of_node' + if os.path.exists(module_dir) and os.path.islink(module_dir) and 'imx708' in os.readlink(module_dir): + if os.path.islink(id_dir) and camera_id in os.readlink(id_dir): + self.device_fd = open(f'/dev/v4l-subdev{i}', 'rb+', buffering=0) + break + + if self.device_fd is None: + raise RuntimeError('IMX708: Requested camera v4l2 device node not found') + + def __del__(self): + self.close() + + def close(self): + if self.device_fd: + self.device_fd.close() + self.device_fd = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, tb): + self.close() + + def set_sensor_hdr_mode(self, enable: bool): + """ + Set the sensor HDR mode (True/False) on the IMX708 device. + + Note that after changing the HDR mode, you must + re-initialise the Picamera2 object to cache the updated sensor modes. + """ + ctrl = v4l2_control() + ctrl.id = HDR_CTRL_ID + ctrl.value = int(enable) + + try: + fcntl.ioctl(self.device_fd, VIDIOC_S_CTRL, ctrl) + except OSError as err: + print(f'IMX708: Unable to set HDR control in the device node: {err}') + + # Must reset the camera manager so that cached sensor modes can be refreshed. + Picamera2._cm.reset() diff --git a/picamera2/encoders/encoder.py b/picamera2/encoders/encoder.py index d194f107..7a9954fa 100644 --- a/picamera2/encoders/encoder.py +++ b/picamera2/encoders/encoder.py @@ -3,6 +3,7 @@ import threading from enum import Enum +import av from libcamera import controls import picamera2.formats as formats @@ -25,7 +26,25 @@ class Quality(Enum): class Encoder: - """Base class for encoders""" + """ + Base class for encoders. 
+ + Mostly this defines the API for derived encoder classes, but it also handles optional audio encoding. + For audio, a separate thread is started, which encodes audio packets and forwards them to the + encoder's output object(s). This only work when the output object understands the audio stream, + meaning that (at the time of writing) this must be a PyavOutput (though you could send output there + via a CircularOutput2). + + Additional audio parameters: + audio - set to True to enable audio encoding and output. + audio_input - list of parameters that is passed to PyAv.open to create the audio input. + audio_output - list of parameters passed to PyAv add_stream to define the audio codec and output stream. + audio_sync - value (in us) by which to advance the audio stream to better sync with the video. + + Reasonable defaults are supplied so that applications can often just set the audio property to True. + The audio_input and audio_output parameters are passed directly to PyAV, so will accept whatever PyAV + understands. + """ def __init__(self): """Initialises encoder""" @@ -38,6 +57,18 @@ def __init__(self): self._name = None self._lock = threading.Lock() self.firsttimestamp = None + self.frame_skip_count = 1 + self._skip_count = 0 + self._output_lock = threading.Lock() + # Set to True to enable audio. + self.audio = False + # These parameters are passed to Pyav to open the input audio container. + self.audio_input = {'file': 'default', 'format': 'pulse'} + # THese parameters are passed to Pyav for creating the encoded audio output stream. + self.audio_output = {'codec_name': 'aac'} + self.audio_sync = -100000 # in us, so by default, delay audio by 100ms + self._audio_start = threading.Event() + self.frames_encoded = 0 @property def running(self): @@ -206,8 +237,15 @@ def encode(self, stream, request): :param request: Request :type request: request """ - with self._lock: - self._encode(stream, request) + if self.audio: + self._audio_start.set() # Signal the audio encode thread to start. + if self._skip_count == 0: + with self._lock: + if not self._running: + return + self._encode(stream, request) + self._skip_count = (self._skip_count + 1) % self.frame_skip_count + self.frames_encoded += 1 def _encode(self, stream, request): if isinstance(stream, str): @@ -220,12 +258,27 @@ def start(self, quality=None): with self._lock: if self._running: raise RuntimeError("Encoder already running") + self.frames_encoded = 0 self._setup(quality) self._running = True + self.firsttimestamp = None for out in self._output: out.start() self._start() + # Start the audio, if that's been requested. + if self.audio: + self._audio_input_container = av.open(**self.audio_input) + self._audio_input_stream = self._audio_input_container.streams.get(audio=0)[0] + self._audio_output_container = av.open("/dev/null", 'w', format="null") + self._audio_output_stream = self._audio_output_container.add_stream(**self.audio_output) + # Outputs that can handle audio need to be told about its existence. + for out in self._output: + out._add_stream(self._audio_output_stream, **self.audio_output) + self._audio_thread = threading.Thread(target=self._audio_thread_func, daemon=True) + self._audio_start.clear() + self._audio_thread.start() # audio thread will wait for the _audio_start event. + def _start(self): pass @@ -235,13 +288,18 @@ def stop(self): raise RuntimeError("Encoder already stopped") self._running = False self._stop() + if self.audio: + self._audio_start.set() # just in case it wasn't! 
+ self._audio_thread.join() + self._audio_input_container.close() + self._audio_output_container.close() for out in self._output: out.stop() def _stop(self): pass - def outputframe(self, frame, keyframe=True, timestamp=None): + def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False): """Writes a frame :param frame: Frame @@ -249,8 +307,9 @@ def outputframe(self, frame, keyframe=True, timestamp=None): :param keyframe: Whether frame is a keyframe or not, defaults to True :type keyframe: bool, optional """ - for out in self._output: - out.outputframe(frame, keyframe, timestamp) + with self._output_lock: + for out in self._output: + out.outputframe(frame, keyframe, timestamp, packet, audio) def _setup(self, quality): pass @@ -264,3 +323,33 @@ def _timestamp(self, request): else: timestamp_us = ts - self.firsttimestamp return timestamp_us + + def _handle_audio_packet(self, audio_packet): + # Write out audio an packet, dealing with timestamp adjustments. + time_scale_factor = 1000000 * self._audio_output_stream.codec_context.time_base + delta = int(self.audio_sync / time_scale_factor) # convert to audio time base + audio_packet.pts -= delta + audio_packet.dts -= delta + timestamp = int(audio_packet.pts * time_scale_factor) # want this in us + if audio_packet.pts >= 0: + self.outputframe(None, True, timestamp, audio_packet, True) + + def _audio_thread_func(self): + # Audio thread that fetches audio packets, encodes them and forwards them to the output. + # The output has to be able to understand audio, which means using a PyavOutput. + # _audio_start gets signalled when the first video frame is submitted for encode, which will hopefully + # keep the audio_sync adjustment more similar across different devices. Until that happens, though, + # we must keep consuming and discarding the audio. + for _ in self._audio_input_container.decode(self._audio_input_stream): + if self._audio_start.isSet(): + break + + for audio_frame in self._audio_input_container.decode(self._audio_input_stream): + if not self._running: + break + for audio_packet in self._audio_output_stream.encode(audio_frame): + self._handle_audio_packet(audio_packet) + + # Flush out any remaining audio packets. + for audio_packet in self._audio_output_stream.encode(None): + self._handle_audio_packet(audio_packet) diff --git a/picamera2/encoders/h264_encoder.py b/picamera2/encoders/h264_encoder.py index 900daca1..532f175d 100644 --- a/picamera2/encoders/h264_encoder.py +++ b/picamera2/encoders/h264_encoder.py @@ -84,6 +84,10 @@ def _start(self): self._controls += [(V4L2_CID_MPEG_VIDEO_H264_MIN_QP, self.qp)] self._controls += [(V4L2_CID_MPEG_VIDEO_H264_MAX_QP, self.qp)] + # The output objects may need to know what kind of stream this is. 
+ for out in self._output: + out._add_stream("video", "h264") + super()._start() def _setup(self, quality): diff --git a/picamera2/encoders/libav_h264_encoder.py b/picamera2/encoders/libav_h264_encoder.py index 454e1da1..2825a163 100644 --- a/picamera2/encoders/libav_h264_encoder.py +++ b/picamera2/encoders/libav_h264_encoder.py @@ -1,11 +1,13 @@ """This is a base class for a multi-threaded software encoder.""" +import collections import time from fractions import Fraction from math import sqrt import av +import picamera2.platform as Platform from picamera2.encoders.encoder import Encoder, Quality from ..request import MappedArray @@ -28,6 +30,24 @@ def __init__(self, bitrate=None, repeat=True, iperiod=30, framerate=30, qp=None, self.drop_final_frames = False self.threads = 0 # means "you choose" self._lasttimestamp = None + self._use_hw = False + self._request_release_delay = 1 + self._request_release_queue = None + + @property + def use_hw(self): + """Whether hardware encode will be used (can be set to True only for VC4 platforms).""" + return self._use_hw + + @use_hw.setter + def use_hw(self, value): + """Set this property in order to get libav to use the V4L2 hardware encoder (VC4 platforms only).""" + if value: + if Platform.get_platform() == Platform.Platform.VC4: + self._use_hw = True + self._codec = "h264_v4l2m2m" + else: + print("Warning: use_hw has no effect on non-VC4 platforms") def _setup(self, quality): # If an explicit quality was specified, use it, otherwise try to preserve any bitrate/qp @@ -57,18 +77,24 @@ def _start(self): self._stream.height = self.height self._stream.pix_fmt = "yuv420p" + for out in self._output: + out._add_stream(self._stream, self._codec, rate=self.framerate) + preset = "ultrafast" if self.profile is not None: if not isinstance(self.profile, str): raise RuntimeError("Profile should be a string value") - # Much more helpful to compare profile names case insensitively! - available_profiles = {k.lower(): v for k, v in self._stream.codec.profiles.items()} - profile = self.profile.lower() - if profile not in available_profiles: + # Find the right profile name, ignoring case. 
+ profile = None + for available_profile in self._stream.profiles: + if self.profile.lower() == available_profile.lower(): + profile = available_profile + break + if not profile: raise RuntimeError("Profile " + self.profile + " not recognised") - self._stream.codec_context.profile = available_profiles[profile] + self._stream.profile = profile # The "ultrafast" preset always produces baseline, so: - if "baseline" not in profile: + if "baseline" not in profile.lower(): preset = "superfast" if self.bitrate is not None: @@ -98,6 +124,8 @@ def _start(self): "XRGB8888": "bgra"} self._av_input_format = FORMAT_TABLE[self._format] + self._request_release_queue = collections.deque() + def _stop(self): if not self.drop_final_frames: # Annoyingly, libav still has lots of encoded frames internally which we must flush @@ -110,14 +138,20 @@ def _stop(self): if delay_us > 0: time.sleep(delay_us / 1000000) self._lasttimestamp = (time.monotonic_ns(), packet.pts) - self.outputframe(bytes(packet), packet.is_keyframe, timestamp=packet.pts) + self.outputframe(bytes(packet), packet.is_keyframe, timestamp=packet.pts, packet=packet) + while self._request_release_queue: + self._request_release_queue.popleft().release() self._container.close() def _encode(self, stream, request): + request.acquire() + self._request_release_queue.append(request) timestamp_us = self._timestamp(request) with MappedArray(request, stream) as m: - frame = av.VideoFrame.from_ndarray(m.array, format=self._av_input_format, width=self.width) + frame = av.VideoFrame.from_numpy_buffer(m.array, format=self._av_input_format, width=self.width) frame.pts = timestamp_us for packet in self._stream.encode(frame): self._lasttimestamp = (time.monotonic_ns(), packet.pts) - self.outputframe(bytes(packet), packet.is_keyframe, timestamp=packet.pts) + self.outputframe(bytes(packet), packet.is_keyframe, timestamp=packet.pts, packet=packet) + while len(self._request_release_queue) > self._request_release_delay: + self._request_release_queue.popleft().release() diff --git a/picamera2/encoders/libav_mjpeg_encoder.py b/picamera2/encoders/libav_mjpeg_encoder.py index 1b80efe2..c6e21c59 100644 --- a/picamera2/encoders/libav_mjpeg_encoder.py +++ b/picamera2/encoders/libav_mjpeg_encoder.py @@ -1,5 +1,6 @@ """This is a base class for a multi-threaded software encoder.""" +import collections from fractions import Fraction import av @@ -21,6 +22,8 @@ def __init__(self, bitrate=None, repeat=True, iperiod=30, framerate=30, qp=None) self.iperiod = iperiod self.framerate = framerate self.qp = qp + self._request_release_delay = 1 + self._request_release_queue = None def _setup(self, quality): # If an explicit quality was specified, use it, otherwise try to preserve any bitrate/qp @@ -40,12 +43,15 @@ def _start(self): self._stream = self._container.add_stream(self._codec, rate=self.framerate) self._stream.codec_context.thread_count = 8 - self._stream.codec_context.thread_type = av.codec.context.ThreadType.FRAME + self._stream.codec_context.thread_type = av.codec.context.ThreadType.FRAME # noqa self._stream.width = self.width self._stream.height = self.height self._stream.pix_fmt = "yuv420p" + for out in self._output: + out._add_stream(self._stream, self._codec, rate=self.framerate) + # This is all rather arbitrary but comes out with a vaguely plausible a quantiser. 
I # found that the sqrt of the quantiser times the bitrate was approximately constant with # the value 64000000 for a 1080p30 stream, though obviously it will depend on content, @@ -59,7 +65,11 @@ def _start(self): self._stream.codec_context.qmin = self.qp self._stream.codec_context.qmax = self.qp self._stream.codec_context.color_range = 2 # JPEG (full range) - self._stream.codec_context.flags |= av.codec.context.Flags.QSCALE + try: + # "qscale" is now correct, but some older versions used "QSCALE" + self._stream.codec_context.flags |= av.codec.context.Flags.qscale # noqa + except AttributeError: + self._stream.codec_context.flags |= av.codec.context.Flags.QSCALE # noqa self._stream.codec_context.time_base = Fraction(1, 1000000) @@ -70,15 +80,21 @@ def _start(self): "XRGB8888": "bgra"} self._av_input_format = FORMAT_TABLE[self._format] + self._request_release_queue = collections.deque() + def _stop(self): for packet in self._stream.encode(): - self.outputframe(bytes(packet), packet.is_keyframe, timestamp=packet.pts) + self.outputframe(bytes(packet), packet.is_keyframe, timestamp=packet.pts, packet=packet) + while self._request_release_queue: + self._request_release_queue.popleft().release() self._container.close() def _encode(self, stream, request): timestamp_us = self._timestamp(request) with MappedArray(request, stream) as m: - frame = av.VideoFrame.from_ndarray(m.array, format=self._av_input_format, width=self.width) + frame = av.VideoFrame.from_numpy_buffer(m.array, format=self._av_input_format, width=self.width) frame.pts = timestamp_us for packet in self._stream.encode(frame): - self.outputframe(bytes(packet), packet.is_keyframe, timestamp=packet.pts) + self.outputframe(bytes(packet), packet.is_keyframe, timestamp=packet.pts, packet=packet) + while len(self._request_release_queue) > self._request_release_delay: + self._request_release_queue.popleft().release() diff --git a/picamera2/encoders/mjpeg_encoder.py b/picamera2/encoders/mjpeg_encoder.py index 7dd57226..f049590a 100644 --- a/picamera2/encoders/mjpeg_encoder.py +++ b/picamera2/encoders/mjpeg_encoder.py @@ -34,3 +34,10 @@ def _setup(self, quality): actual_complexity = self.width * self.height * getattr(self, "framerate", 30) reference_bitrate = BITRATE_TABLE[quality] * 1000000 self.bitrate = int(reference_bitrate * actual_complexity / reference_complexity) + + def _start(self): + # The output objects may need to know what kind of stream this is. + for out in self._output: + out._add_stream("video", "mjpeg", rate=30) # seem to need a rate to prevent timestamp warnings + + super()._start() diff --git a/picamera2/encoders/v4l2_encoder.py b/picamera2/encoders/v4l2_encoder.py index 5f3944dd..5f9e5c38 100644 --- a/picamera2/encoders/v4l2_encoder.py +++ b/picamera2/encoders/v4l2_encoder.py @@ -189,6 +189,9 @@ def thread_poll(self, buf_available): # few hundred ms than wait forever. Note that self.buf_frame.qsize() # frames (usually just 1) are getting dropped here, and won't be # encoded. I've only ever seen this on a Pi Zero. + while self.buf_frame.qsize() > 0: + queue_item = self.buf_frame.get() + queue_item.release() break for _, event in events: diff --git a/picamera2/job.py b/picamera2/job.py index 6b1865c9..ad5fa399 100644 --- a/picamera2/job.py +++ b/picamera2/job.py @@ -1,4 +1,4 @@ -from concurrent.futures import Future +from concurrent.futures import CancelledError, Future class Job: @@ -77,3 +77,11 @@ def get_result(self, timeout=None): if necessary for the job to complete. 
""" return self._future.result(timeout=timeout) + + def cancel(self): + """Mark this job as cancelled, so that requesting the result raises a CancelledError. + + User code should not call this because it won't unschedule the job, i.e. remove it + from the job queue. Use Picamera2.cancel_all_and_flush() to cancel and clear all jobs. + """ + self._future.set_exception(CancelledError) diff --git a/picamera2/outputs/__init__.py b/picamera2/outputs/__init__.py index 1c07c56d..34c06358 100644 --- a/picamera2/outputs/__init__.py +++ b/picamera2/outputs/__init__.py @@ -1,4 +1,6 @@ from .circularoutput import CircularOutput +from .circularoutput2 import CircularOutput2 from .ffmpegoutput import FfmpegOutput from .fileoutput import FileOutput from .output import Output +from .pyavoutput import PyavOutput diff --git a/picamera2/outputs/circularoutput.py b/picamera2/outputs/circularoutput.py index a8ef5b4f..816ecea1 100644 --- a/picamera2/outputs/circularoutput.py +++ b/picamera2/outputs/circularoutput.py @@ -40,7 +40,7 @@ def buffersize(self, value): self._buffersize = value self._circular = collections.deque(maxlen=value) - def outputframe(self, frame, keyframe=True, timestamp=None): + def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False): """Write frame to circular buffer :param frame: Frame @@ -50,6 +50,8 @@ def outputframe(self, frame, keyframe=True, timestamp=None): :param timestamp: Timestamp of frame :type timestamp: int """ + if audio: + raise RuntimeError("CircularOutput does not support audio") with self._lock: if self._buffersize == 0: return diff --git a/picamera2/outputs/circularoutput2.py b/picamera2/outputs/circularoutput2.py new file mode 100644 index 00000000..cdf363f3 --- /dev/null +++ b/picamera2/outputs/circularoutput2.py @@ -0,0 +1,136 @@ +"""Circular buffer""" + +import collections +from threading import Lock + +from .output import Output + + +class CircularOutput2(Output): + """ + Circular buffer implementation, much like CircularOutput, but for general outputs. + + This means it can be used in conjunction with, for example, a PyavOutput to create time-shifted + recordings of both video and audio straight to an mp4 file. + + Once the CircularOutput2 has been started, use the open_output method to start start recording + a new output, and use close_output when finished. If the output has not been closed when the + circular buffer is stopped, then the remainder of the buffer will be flush into the output. + """ + + def __init__(self, pts=None, buffer_duration_ms=5000): + """Create a CircularOutput2.""" + super().__init__(pts=pts) + # A note on locking. The lock is principally to protect outputframe, which is called by + # the background encoder thread. Applications are going to call things like open_output, + # close_output, start and stop. These only grab that lock for a short period of time to + # manipulate _output_available, which controls whether outputframe will do anything. + # THe application API does not have it's own lock, because there doesn't seem to be a + # need to drive it from different threads (though we could add one if necessary). 
+ self._lock = Lock() + if buffer_duration_ms < 0: + raise RuntimeError("buffer_duration_ms may not be negative") + self._buffer_duration_ms = buffer_duration_ms + self._circular = collections.deque() + self._output = None + self._output_available = False + self._streams = [] + + @property + def buffer_duration_ms(self): + """Returns duration of the buffer in ms""" + return self._buffer_duration_ms + + @buffer_duration_ms.setter + def buffer_duration_ms(self, value): + """Set buffer duration in ms, can even be changed dynamically""" + with self._lock: + self._buffer_duration_ms = value + + def open_output(self, output): + """Open a new output object and start writing to it.""" + if self._output: + raise RuntimeError("Underlying output must be closed first") + + self._output = output + self._output.start() + # Some outputs (PyavOutput) may need to know about the encoder's streams. + for encoder_stream, codec, kwargs in self._streams: + output._add_stream(encoder_stream, codec, **kwargs) + + # Now it's OK for the background thread to output frames. + with self._lock: + self._output_available = True + self._first_frame = True + + def close_output(self): + """Close an output object.""" + if not self._output: + raise RuntimeError("No underlying output has been opened") + + # After this, we guarantee that the background thread will never use the output. + with self._lock: + self._output_available = False + + self._output.stop() + self._output = None + + def _get_frame(self): + # Fetch the next frame to be saved to the underlying output. + if not self._circular: + return + if not self._first_frame: + return self._circular.popleft() + # Must skip ahead to the first I frame if we haven't seen one yet. + while self._circular: + entry = self._circular.popleft() + _, key_frame, _, _, audio = entry + # If there is audio, all audio frames are likely to be keyframes, so we must ignore them when + # deciding when the streams can resume - only the video counts. + if key_frame and not audio: + self._first_frame = False + return entry + + def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False): + """Write frame to circular buffer""" + with self._lock: + if self._buffer_duration_ms == 0 or not self.recording: + return + self._circular.append((frame, keyframe, timestamp, packet, audio)) + # Discard any expired buffer entries. + while timestamp - self._circular[0][2] > self._buffer_duration_ms * 1000: + self._circular.popleft() + + if self._output_available: + # Actually write this to the underlying output. + entry = self._get_frame() + if entry: + self._output.outputframe(*entry) + + def start(self): + """Start recording in the circular buffer.""" + with self._lock: + if self.recording: + raise RuntimeError("Circular output is running") + self.recording = True + + def stop(self): + """Close file handle and stop recording""" + with self._lock: + if not self.recording: + raise RuntimeError("Circular output was not started") + self._recording = False + self._output_available = False + + # Flush out anything remaining in the buffer if the underlying output is still going + # when we stop. + if self._output: + while (entry := self._get_frame()): + self._output.outputframe(*entry) + self._output.stop() + self._output = None + + def _add_stream(self, encoder_stream, codec_name, **kwargs): + # Notice the PyavOutput of a stream that will be sending it packets to write out. It will need + # to forward these whenever a new underlying output is opened. 
+ self._streams.append((encoder_stream, codec_name, kwargs)) diff --git a/picamera2/outputs/ffmpegoutput.py b/picamera2/outputs/ffmpegoutput.py index e6938d58..1f35c763 100644 --- a/picamera2/outputs/ffmpegoutput.py +++ b/picamera2/outputs/ffmpegoutput.py @@ -1,3 +1,4 @@ +import gc import signal import subprocess @@ -33,12 +34,13 @@ class FfmpegOutput(Output): """ def __init__(self, output_filename, audio=False, audio_device="default", audio_sync=-0.3, - audio_samplerate=48000, audio_codec="aac", audio_bitrate=128000, pts=None): + audio_samplerate=48000, audio_codec="aac", audio_bitrate=128000, audio_filter=None, pts=None): super().__init__(pts=pts) self.ffmpeg = None self.output_filename = output_filename self.audio = audio self.audio_device = audio_device + self.audio_filter = audio_filter self.audio_sync = audio_sync self.audio_samplerate = audio_samplerate self.audio_codec = audio_codec @@ -71,6 +73,8 @@ def start(self): '-i', self.audio_device] audio_codec = ['-b:a', str(self.audio_bitrate), '-c:a', self.audio_codec] + if self.audio_filter: # Check if audio_filter is not empty or None + audio_codec.extend(['-af', self.audio_filter]) command = ['ffmpeg'] + general_options + audio_input + video_input + \ audio_codec + video_codec + self.output_filename.split() @@ -93,8 +97,12 @@ def stop(self): except Exception: pass self.ffmpeg = None + # This seems to be necessary to get the subprocess to clean up fully. + gc.collect() - def outputframe(self, frame, keyframe=True, timestamp=None): + def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False): + if audio: + raise RuntimeError("FfmpegOutput does not support audio packets from Picamera2") if self.recording and self.ffmpeg: # Handle the case where the FFmpeg prcoess has gone away for reasons of its own. try: diff --git a/picamera2/outputs/fileoutput.py b/picamera2/outputs/fileoutput.py index 872c2cc6..0f0b8921 100644 --- a/picamera2/outputs/fileoutput.py +++ b/picamera2/outputs/fileoutput.py @@ -72,7 +72,7 @@ def connectiondead(self, _callback): else: raise RuntimeError("Must pass callback function or None") - def outputframe(self, frame, keyframe=True, timestamp=None): + def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False): """Outputs frame from encoder :param frame: Frame @@ -82,6 +82,8 @@ def outputframe(self, frame, keyframe=True, timestamp=None): :param timestamp: Timestamp of frame :type timestamp: int """ + if audio: + raise RuntimeError("Fileoutput does not support audio") if self._fileoutput is not None and self.recording: if self._firstframe: if not keyframe: diff --git a/picamera2/outputs/output.py b/picamera2/outputs/output.py index 7e55368b..3fa185b9 100644 --- a/picamera2/outputs/output.py +++ b/picamera2/outputs/output.py @@ -22,7 +22,7 @@ def stop(self): """Stop recording""" self.recording = False - def outputframe(self, frame, keyframe=True, timestamp=None): + def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False): """Outputs frame from encoder :param frame: Frame @@ -57,3 +57,7 @@ def ptsoutput(self, file): self._ptsoutput = open(file, "w") else: self._ptsoutput = file + + def _add_stream(self, encoder_stream, *args, **kwargs): + # Some output types might need to know about an encoder's output stream. 
+ pass diff --git a/picamera2/outputs/pyavoutput.py b/picamera2/outputs/pyavoutput.py new file mode 100644 index 00000000..54639632 --- /dev/null +++ b/picamera2/outputs/pyavoutput.py @@ -0,0 +1,91 @@ +from fractions import Fraction + +import av + +from .output import Output + + +class PyavOutput(Output): + """ + The PyavOutput class outputs an encoded video, and optionally audio, stream using PyAV. + + PyAv is a Python interface to libav, used by FFmpeg, and therefore can accept many different output + types and destinations, in the same way as FFmpeg. + + The PyavOutput calls directly into libav through its Python layer, and does not pipe encoded frames + out to a separate process like the FfmpegOutput. The PyavOutput integration means we can pass precise + timestamps, and are not subject to the hazards of FFmpeg re-timestamping everything as it gets piped + back in. + """ + + def __init__(self, output_name, format=None, pts=None): + super().__init__(pts=pts) + self._output_name = output_name + self._format = format + self._streams = {} + self._container = None + # A user can set this to get notifications of failures. + self.error_callback = None + + def _add_stream(self, encoder_stream, codec_name, **kwargs): + # The output container that does the muxing needs to know about the streams for which packets + # will be sent to it. It literally needs to copy them for the output container. + stream = self._container.add_stream(codec_name, **kwargs) + + if codec_name == "mjpeg": + # Well, this is nasty. MJPEG seems to need this. + stream.codec_context.color_range = 2 # JPEG (full range) + + self._streams[encoder_stream] = stream + + def start(self): + """Start the PyavOutput.""" + self._container = av.open(self._output_name, "w", format=self._format) + super().start() + + def stop(self): + """Stop the PyavOutput.""" + super().stop() + if self._container: + try: + self._container.close() + except Exception: + pass + self._container = None + self._streams = {} + + def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False): + """Output an encoded frame using PyAv.""" + if self.recording and self._container: + orig_stream = None + # We must make a packet that looks like it came from our own container's version of the stream. + if not packet: + # No packet present. It must have come from a video encoder that isn't using libav, so make one up. + packet = av.Packet(frame) + packet.dts = timestamp + packet.pts = timestamp + packet.time_base = Fraction(1, 1000000) + packet.stream = self._streams["video"] + else: + # We can perform a switcheroo on the packet's stream, swapping the encoder's version for ours! + orig_stream = packet.stream + if orig_stream not in self._streams: + raise RuntimeError("Stream not found in PyavOutput") + packet.stream = self._streams[orig_stream] + + try: + self._container.mux(packet) + except Exception as e: + try: + self._container.close() + except Exception: + pass + self._container = None + if self.error_callback: + self.error_callback(e) + + # Put the original stream back, just in case the encoder has multiple outputs and will pass + # it to each one. 
+ packet.stream = orig_stream + + self.outputtimestamp(timestamp) diff --git a/picamera2/picamera2.py b/picamera2/picamera2.py index caff2cf0..42373447 100644 --- a/picamera2/picamera2.py +++ b/picamera2/picamera2.py @@ -2,6 +2,7 @@ """picamera2 main class""" import atexit +import contextlib import json import logging import os @@ -55,13 +56,24 @@ def __init__(self): self.running = False self.cameras = {} self._lock = threading.Lock() + self._cms = None def setup(self): - self.cms = libcamera.CameraManager.singleton() self.thread = threading.Thread(target=self.listen, daemon=True) self.running = True self.thread.start() + @property + def cms(self): + if self._cms is None: + self._cms = libcamera.CameraManager.singleton() + return self._cms + + def reset(self): + with self._lock: + self._cms = None + self._cms = libcamera.CameraManager.singleton() + def add(self, index, camera): with self._lock: self.cameras[index] = camera @@ -77,7 +89,7 @@ def cleanup(self, index): flag = True if flag: self.thread.join() - self.cms = None + self._cms = None def listen(self): sel = selectors.DefaultSelector() @@ -90,7 +102,7 @@ def listen(self): callback() sel.unregister(self.cms.event_fd) - self.cms = None + self._cms = None def handle_request(self, flushid=None): """Handle requests @@ -208,7 +220,7 @@ def describe_camera(cam, num): info["Id"] = cam.id info["Num"] = num return info - cameras = [describe_camera(cam, i) for i, cam in enumerate(libcamera.CameraManager.singleton().cameras)] + cameras = [describe_camera(cam, i) for i, cam in enumerate(Picamera2._cm.cms.cameras)] # Sort alphabetically so they are deterministic, but send USB cams to the back of the class. return sorted(cameras, key=lambda cam: ("/usb" not in cam['Id'], cam['Id']), reverse=True) @@ -258,12 +270,12 @@ def __init__(self, camera_num=0, verbose_console=None, tuning=None, allocator=No _log.debug(f"{self.camera_manager}") # We deliberately make raw streams with no size so that it will be filled in # later once the main stream size has been set. 
- self.preview_configuration = self.create_preview_configuration() - self.preview_configuration.enable_raw() # causes the size to be reset to None - self.still_configuration = self.create_still_configuration() - self.still_configuration.enable_raw() # ditto - self.video_configuration = self.create_video_configuration() - self.video_configuration.enable_raw() # ditto + self.preview_configuration_ = CameraConfiguration(self.create_preview_configuration(), self) + self.preview_configuration_.enable_raw() # causes the size to be reset to None + self.still_configuration_ = CameraConfiguration(self.create_still_configuration(), self) + self.still_configuration_.enable_raw() # ditto + self.video_configuration_ = CameraConfiguration(self.create_video_configuration(), self) + self.video_configuration_.enable_raw() # ditto except Exception: _log.error("Camera __init__ sequence did not complete.") raise RuntimeError("Camera __init__ sequence did not complete.") @@ -413,12 +425,10 @@ def _grab_camera(self, idx): elif isinstance(idx, int): return self.camera_manager.cameras[idx] - def _initialize_camera(self) -> bool: + def _initialize_camera(self) -> None: """Initialize camera :raises RuntimeError: Failure to initialise camera - :return: True if success - :rtype: bool """ if not self.camera_manager.cameras: _log.error("Camera(s) not found (Do not forget to disable legacy camera with raspi-config).") @@ -446,7 +456,6 @@ def _initialize_camera(self) -> bool: self.sensor_format = self._native_mode['format'] _log.info('Initialization successful.') - return True def __identify_camera(self): for idx, address in enumerate(self.camera_manager.cameras): @@ -459,7 +468,9 @@ def _open_camera(self) -> None: :raises RuntimeError: Failed to setup camera """ - if not self._initialize_camera(): + try: + self._initialize_camera() + except RuntimeError: raise RuntimeError("Failed to initialize camera") # This now throws an error if it can't open the camera. 
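[Usage note on the camera enumeration changes above] `Picamera2.global_camera_info()` entries carry a `'Num'` index alongside `'Id'` and `'Model'`, which is how the IMX708 helper and the new tests pick out a specific sensor. A short illustrative sketch (assumes at least one camera is attached; the fallback to camera 0 is just for the example):

```python
from picamera2 import Picamera2

cameras = Picamera2.global_camera_info()

# Pick the first IMX708 by model name, falling back to camera 0 if none is present.
num = next((c['Num'] for c in cameras if c['Model'] == 'imx708'), 0)

picam2 = Picamera2(num)
picam2.configure(picam2.create_preview_configuration())
picam2.start()
picam2.stop()
picam2.close()
```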
@@ -616,12 +627,12 @@ def close(self) -> None: self.streams = None self.stream_map = None self.camera = None - self.camera_ctrl_info = None + self.camera_ctrl_info = {} self.camera_config = None self.libcamera_config = None - self.preview_configuration_ = None - self.still_configuration_ = None - self.video_configuration_ = None + self.preview_configuration = {} + self.still_configuration = {} + self.video_configuration = {} self.notifymeread.close() os.close(self.notifyme_w) # Clean up the allocator @@ -643,7 +654,7 @@ def _make_initial_stream_config(stream_config: dict, updates: dict, ignore_list= """ if updates is None: return None - valid = ("format", "size", "stride") + valid = ("format", "size", "stride", "preserve_ar") for key, value in updates.items(): if isinstance(value, SensorFormat): value = str(value) @@ -680,9 +691,9 @@ def create_preview_configuration(self, main={}, lores=None, raw={}, transform=li if not self._is_rpi_camera(): raw = None sensor = None - main = self._make_initial_stream_config({"format": "XBGR8888", "size": (640, 480)}, main) + main = self._make_initial_stream_config({"format": "XBGR8888", "size": (640, 480), "preserve_ar": True}, main) self.align_stream(main, optimal=False) - lores = self._make_initial_stream_config({"format": "YUV420", "size": main["size"]}, lores) + lores = self._make_initial_stream_config({"format": "YUV420", "size": main["size"], "preserve_ar": False}, lores) if lores is not None: self.align_stream(lores, optimal=False) raw = self._make_initial_stream_config({"format": self.sensor_format, "size": main["size"]}, @@ -714,9 +725,10 @@ def create_still_configuration(self, main={}, lores=None, raw={}, transform=libc if not self._is_rpi_camera(): raw = None sensor = None - main = self._make_initial_stream_config({"format": "BGR888", "size": self.sensor_resolution}, main) + main = self._make_initial_stream_config({"format": "BGR888", "size": self.sensor_resolution, "preserve_ar": True}, + main) self.align_stream(main, optimal=False) - lores = self._make_initial_stream_config({"format": "YUV420", "size": main["size"]}, lores) + lores = self._make_initial_stream_config({"format": "YUV420", "size": main["size"], "preserve_ar": False}, lores) if lores is not None: self.align_stream(lores, optimal=False) raw = self._make_initial_stream_config({"format": self.sensor_format, "size": main["size"]}, @@ -748,9 +760,9 @@ def create_video_configuration(self, main={}, lores=None, raw={}, transform=libc if not self._is_rpi_camera(): raw = None sensor = None - main = self._make_initial_stream_config({"format": "XBGR8888", "size": (1280, 720)}, main) + main = self._make_initial_stream_config({"format": "XBGR8888", "size": (1280, 720), "preserve_ar": True}, main) self.align_stream(main, optimal=False) - lores = self._make_initial_stream_config({"format": "YUV420", "size": main["size"]}, lores) + lores = self._make_initial_stream_config({"format": "YUV420", "size": main["size"], "preserve_ar": False}, lores) if lores is not None: self.align_stream(lores, optimal=False) raw = self._make_initial_stream_config({"format": self.sensor_format, "size": main["size"]}, @@ -933,13 +945,13 @@ def score_format(desired, actual): @staticmethod def align_stream(stream_config: dict, optimal=True) -> None: if optimal: - # Adjust the image size so that all planes are a mutliple of 32 bytes wide. + # Adjust the image size so that all planes are a mutliple of 32/64 bytes wide. # This matches the hardware behaviour and means we can be more efficient. 
- align = 32 + align = 32 if Picamera2.platform == Platform.Platform.VC4 else 64 if stream_config["format"] in ("YUV420", "YVU420"): - align = 64 # because the UV planes will have half this alignment - elif stream_config["format"] in ("XBGR8888", "XRGB8888"): - align = 16 # 4 channels per pixel gives us an automatic extra factor of 2 + align *= 2 # because the UV planes will have half this alignment + elif stream_config["format"] in ("XBGR8888", "XRGB8888", "RGB161616", "BGR161616"): + align //= 2 # we have an automatic extra factor of 2 here else: align = 2 size = stream_config["size"] @@ -1109,9 +1121,17 @@ def configure_(self, camera_config="preview") -> None: self.controls = Controls(self, controls=self.camera_config['controls']) self.configure_count += 1 - def configure(self, camera_config="preview") -> None: - """Configure the camera system with the given configuration.""" - self.configure_(camera_config) + if "ScalerCrops" in self.camera_controls: + par_crop = self.camera_controls["ScalerCrops"] + full_fov = self.camera_controls["ScalerCrop"][1] + scaler_crops = [par_crop[0] if camera_config["main"]["preserve_ar"] else full_fov] + if self.lores_index >= 0: + scaler_crops.append(par_crop[1] if camera_config["lores"]["preserve_ar"] else scaler_crops[0]) + self.set_controls({"ScalerCrops": scaler_crops}) + + def configure(self, camera_config=None) -> None: + """Configure the camera system with the given configuration. Defaults to the 'preview' configuration.""" + self.configure_("preview" if camera_config is None else camera_config) def camera_configuration(self) -> dict: """Return the camera configuration.""" @@ -1164,6 +1184,20 @@ def start(self, config=None, show_preview=False) -> None: self.start_preview(show_preview) self.start_() + def cancel_all_and_flush(self) -> None: + """ + Clear the camera system queue of pending jobs and cancel them. + + Depending on what was happening at the time, this may leave the camera system in + an indeterminate state. This function is really only intended for tidying up + after an operation has unexpectedly timed out (for example, the camera cable has + become dislodged) so that the camera can be closed. + """ + with self.lock: + for job in self._job_list: + job.cancel() + self._job_list = [] + def stop_(self, request=None) -> None: """Stop the camera. @@ -1302,9 +1336,18 @@ def dispatch_functions(self, functions, wait, signal_function=None, immediate=Fa When there are multiple items each will be processed on a separate trip round the event loop, meaning that a single operation could stop and restart the camera and the next operation would receive a request from after the restart. + + The wait parameter should be one of: + True - wait as long as necessary for the operation to compelte + False - return immediately, giving the caller a "job" they can wait for + None - default, if a signal_function was given do not wait, otherwise wait as long as necessary + a number - wait for this number of seconds before raising a "timed out" error. """ if wait is None: wait = signal_function is None + timeout = wait + if timeout is True: + timeout = None with self.lock: only_job = not self._job_list job = Job(functions, signal_function) @@ -1316,7 +1359,7 @@ def dispatch_functions(self, functions, wait, signal_function=None, immediate=Fa # stop commands, for which no requests are needed). 
if only_job and (self.completed_requests or immediate): self._run_process_requests() - return job.get_result() if wait else job + return job.get_result(timeout=timeout) if wait else job def set_frame_drops_(self, num_frames): """Only for use within the camera event loop before calling drop_frames_.""" # noqa @@ -1476,6 +1519,15 @@ def capture_request_and_stop_(self): partial(capture_request_and_stop_, self)] return self.dispatch_functions(functions, wait, signal_function, immediate=True) + @contextlib.contextmanager + def captured_request(self, wait=None, flush=None): + """Capture a completed request using the context manager which guarantees its release.""" + request = self.capture_request(wait=wait, flush=flush) + try: + yield request + finally: + request.release() + def capture_metadata_(self): if not self.completed_requests: return (False, None) @@ -1613,7 +1665,7 @@ def capture_arrays_and_switch_back_(self, preview_config, names): partial(capture_arrays_and_switch_back_, self, preview_config, names)] return self.dispatch_functions(functions, wait, signal_function, immediate=True) - def capture_image_(self, name: str) -> Image: + def capture_image_(self, name: str) -> Image.Image: """Capture image :param name: Stream name @@ -1626,7 +1678,7 @@ def capture_image_(self, name: str) -> Image: request.release() return (True, result) - def capture_image(self, name: str = "main", wait: bool = None, signal_function=None) -> Image: + def capture_image(self, name: str = "main", wait: bool = None, signal_function=None) -> Image.Image: """Make a PIL image from the next frame in the named stream. :param name: Stream name, defaults to "main" @@ -1636,19 +1688,19 @@ def capture_image(self, name: str = "main", wait: bool = None, signal_function=N :param signal_function: Callback, defaults to None :type signal_function: function, optional :return: PIL Image - :rtype: Image + :rtype: Image.Image """ return self.dispatch_functions([partial(self.capture_image_, name)], wait, signal_function) def switch_mode_and_capture_image(self, camera_config, name: str = "main", wait: bool = None, - signal_function=None, delay=0) -> Image: + signal_function=None, delay=0) -> Image.Image: """Switch the camera into a new (capture) mode, capture the image. Then return back to the initial camera mode. 
""" preview_config = self.camera_config - def capture_image_and_switch_back_(self, preview_config, name) -> Image: + def capture_image_and_switch_back_(self, preview_config, name) -> Image.Image: done, result = self.capture_image_(name) if not done: return (False, None) diff --git a/picamera2/platform.py b/picamera2/platform.py index d55a074d..dfa11855 100644 --- a/picamera2/platform.py +++ b/picamera2/platform.py @@ -12,17 +12,17 @@ class Platform(Enum): _platform = Platform.VC4 try: - for num in range(5): + for num in range(64): device = '/dev/video' + str(num) if os.path.exists(device): with open(device, 'rb+', buffering=0) as fd: caps = v4l2.v4l2_capability() fcntl.ioctl(fd, v4l2.VIDIOC_QUERYCAP, caps) decoded = caps.card.decode('utf-8') - if decoded == "rp1-cfe": + if decoded == "pispbe": _platform = Platform.PISP break - elif decoded == "unicam": + elif decoded == "bcm2835-isp": break except Exception: pass diff --git a/picamera2/previews/drm_preview.py b/picamera2/previews/drm_preview.py index e6cba441..06fd2cba 100644 --- a/picamera2/previews/drm_preview.py +++ b/picamera2/previews/drm_preview.py @@ -63,7 +63,7 @@ class DrmPreview(NullPreview): def __init__(self, x=0, y=0, width=640, height=480, transform=None): self.init_drm(x, y, width, height, transform) self.stop_count = 0 - self.fb = pykms.DumbFramebuffer(self.card, width, height, "AB24") + self.fb = pykms.DumbFramebuffer(self.card, width, height, "XB24") self.mem = mmap.mmap(self.fb.fd(0), width * height * 3, mmap.MAP_SHARED, mmap.PROT_WRITE) self.fd = self.fb.fd(0) super().__init__(width=width, height=height) @@ -104,6 +104,8 @@ def set_overlay(self, overlay): raise RuntimeError("Preview must be configured before setting an overlay") if self.picam2.camera_config['buffer_count'] < 2: raise RuntimeError("Need at least buffer_count=2 to set overlay") + if self.overlay_plane is None: + raise RuntimeError("Overlays not supported on this device") if overlay is None: self.overlay_new_fb = None @@ -150,23 +152,28 @@ def render_drm(self, picam2, completed_request): self.plane = self.resman.reserve_overlay_plane(self.crtc, format=fmt) if self.plane is None: - raise RuntimeError("Failed to reserve DRM plane") + # Some display devices may not support "alpha". + self.plane = self.resman.reserve_plane(self.crtc, type=pykms.PlaneType.Primary, format=fmt) + if self.plane is None: + raise RuntimeError("Failed to reserve DRM plane") drm_rotation = 1 if self.transform.hflip: drm_rotation |= 16 if self.transform.vflip: drm_rotation |= 32 - self.plane.set_prop("rotation", drm_rotation) - # The second plane we ask for will go on top of the first. - self.overlay_plane = self.resman.reserve_overlay_plane(self.crtc, format=pykms.PixelFormat.ABGR8888) - if self.overlay_plane is None: - raise RuntimeError("Failed to reserve DRM overlay plane") - # Want "coverage" mode, not pre-multiplied alpha. fkms doesn't seem to have this - # property so we suppress the error, but it seems to have the right behaviour anyway. try: - self.overlay_plane.set_prop("pixel blend mode", 1) + self.plane.set_prop("rotation", drm_rotation) except RuntimeError: pass + # The second plane we ask for will go on top of the first. + self.overlay_plane = self.resman.reserve_overlay_plane(self.crtc, format=pykms.PixelFormat.ABGR8888) + if self.overlay_plane is not None: + # Want "coverage" mode, not pre-multiplied alpha. fkms doesn't seem to have this + # property so we suppress the error, but it seems to have the right behaviour anyway. 
+ try: + self.overlay_plane.set_prop("pixel blend mode", 1) + except RuntimeError: + pass # Use an atomic commit for rendering ctx = pykms.AtomicReq(self.card) diff --git a/picamera2/utils.py b/picamera2/utils.py index 835cfa1c..ea597a97 100644 --- a/picamera2/utils.py +++ b/picamera2/utils.py @@ -5,11 +5,11 @@ def convert_from_libcamera_type(value): if isinstance(value, Rectangle): - value = (value.x, value.y, value.width, value.height) + value = value.to_tuple() elif isinstance(value, Size): - value = (value.width, value.height) + value = value.to_tuple() elif isinstance(value, (list, tuple)) and all(isinstance(item, Rectangle) for item in value): - value = [(v.x, v.y, v.width, v.height) for v in value] + value = [v.to_tuple() for v in value] return value diff --git a/setup.py b/setup.py index f7d8ce1e..f7b98801 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name='picamera2', - version='0.3.18', + version='0.3.24', description='The libcamera-based Python interface to Raspberry Pi cameras, based on the original Picamera library', long_description=long_description, long_description_content_type='text/markdown', @@ -30,8 +30,12 @@ "Programming Language :: Python :: 3.9", "Topic :: Multimedia :: Graphics :: Capture :: Digital Camera", ], - packages=['picamera2', 'picamera2.encoders', 'picamera2.outputs', 'picamera2.previews', 'picamera2.allocators'], + packages=['picamera2', 'picamera2.devices', 'picamera2.devices.hailo', 'picamera2.devices.imx500', + 'picamera2.devices.imx708', 'picamera2.encoders', 'picamera2.outputs', 'picamera2.previews', + 'picamera2.allocators'], python_requires='>=3.9', licence='BSD 2-Clause License', - install_requires=['numpy', 'PiDNG', 'piexif', 'pillow', 'simplejpeg', 'v4l2-python3', 'python-prctl', 'av'], + install_requires=['numpy', 'PiDNG', 'piexif', 'pillow', 'simplejpeg', 'v4l2-python3', + 'python-prctl', 'av', 'libarchive-c', 'tqdm', + 'jsonschema'], extras_require={"gui": ['pyopengl', 'PyQt5']}) diff --git a/tests/alignment.py b/tests/alignment.py new file mode 100755 index 00000000..d2d04d81 --- /dev/null +++ b/tests/alignment.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 + +from picamera2 import Picamera2, Platform + +picam2 = Picamera2() + + +def compute_expected_stride(width, format): + if format in ("BGR888", "RGB888"): + return width * 3 + elif format in ("XBGR8888", "XRGB8888"): + return width * 4 + elif format in ("YUV420", "YVU420"): + return width + elif format in ("RGB161616", "BGR161616"): + return width * 6 + + +def test_alignment(width, format): + config = picam2.create_preview_configuration({'size': (width, 480), 'format': format}, buffer_count=1) + picam2.align_configuration(config) + picam2.configure(config) + actual_stride = config['main']['stride'] + actual_width = config['main']['size'][0] + expected_stride = compute_expected_stride(actual_width, format) + if actual_stride != expected_stride: + print("ERROR: stride", actual_stride, "!=", expected_stride, "for format", format) + return 1 + return 0 + + +formats = ["RGB888", "XRGB8888", "YUV420"] +if picam2.platform == Platform.PISP: + formats.append("RGB161616") + +failures = 0 +for format in formats: + for width in range(512, 1025, 2): + print(format) + failures += test_alignment(width, format) + +print("Failures:", failures) diff --git a/tests/check_timestamps.py b/tests/check_timestamps.py index 48747f89..3bebba0c 100755 --- a/tests/check_timestamps.py +++ b/tests/check_timestamps.py @@ -11,7 +11,7 @@ class TimestampCollector(Output): """Output class that doesn't output 
anything but collects frame timestamps""" - def outputframe(self, frame, keyframe=True, timestamp=None): + def outputframe(self, frame, keyframe=True, timestamp=None, packet=None, audio=False): if timestamp is not None: timestamps.append(timestamp) diff --git a/tests/crop_test.py b/tests/crop_test.py new file mode 100644 index 00000000..350123f9 --- /dev/null +++ b/tests/crop_test.py @@ -0,0 +1,28 @@ +#!/usr/bin/python3 + +# Test setting the "preserve_ar" stream configuration flag + +import cv2 + +from picamera2 import Picamera2, Platform + +# VC4 platforms do not support different crops for the two outputs. +if Picamera2.platform == Platform.VC4: + print("SKIPPED (VC4 platform)") + quit(0) + +picam2 = Picamera2() + +for m, l in [(False, False), (False, True), (True, False), (True, True)]: + cfg = picam2.create_video_configuration(main={"size": (1920, 1080), "format": 'XRGB8888', "preserve_ar": m}, + lores={"size": (320, 320), "format": 'XRGB8888', "preserve_ar": l}, + display="main") + picam2.configure(cfg) + picam2.start(show_preview=True) + + for _ in range(50): + im = picam2.capture_array("lores") + cv2.imshow("lores", im) + cv2.waitKey(1) + + picam2.stop() diff --git a/tests/grey_world.py b/tests/grey_world.py new file mode 100755 index 00000000..7edaafac --- /dev/null +++ b/tests/grey_world.py @@ -0,0 +1,18 @@ +#!/usr/bin/python3 +import time + +from picamera2 import Picamera2 + +# Here we load up the tuning for the HQ cam and alter the default exposure profile. +# For more information on what can be changed, see chapter 5 in +# https://datasheets.raspberrypi.com/camera/raspberry-pi-camera-guide.pdf + +tuning = Picamera2.load_tuning_file("imx477.json") +awb = Picamera2.find_tuning_algo(tuning, "rpi.awb") +awb.clear() +awb['bayes'] = 0 +picam2 = Picamera2(tuning=tuning) +picam2.configure(picam2.create_preview_configuration()) +picam2.start() +time.sleep(2) +picam2.stop() diff --git a/tests/imx708_device.py b/tests/imx708_device.py new file mode 100755 index 00000000..4ee32787 --- /dev/null +++ b/tests/imx708_device.py @@ -0,0 +1,28 @@ +#!/bin/python3 + +from picamera2 import Picamera2 +from picamera2.devices.imx708 import IMX708 + +camera_info = Picamera2.global_camera_info() +camera_num = next((c['Num'] for c in camera_info if c['Model'] == 'imx708'), None) + +if camera_num is not None: + with IMX708(camera_num) as cam: + cam.set_sensor_hdr_mode(True) + picam2 = Picamera2(camera_num) + if len(picam2.sensor_modes) != 1: + print("ERROR: We should only report 1 sensor HDR mode") + picam2.close() + + cam.set_sensor_hdr_mode(False) + picam2 = Picamera2(camera_num) + if len(picam2.sensor_modes) <= 1: + print("ERROR: We should report > 1 sensor non-HDR modes") + picam2.close() + + cam = IMX708(camera_num) + cam.set_sensor_hdr_mode(True) + picam2 = Picamera2(camera_num) + if len(picam2.sensor_modes) != 1: + print("ERROR: We should only report 1 sensor HDR mode") + picam2.close() diff --git a/tests/mode_test.py b/tests/mode_test.py index 77f45387..a040943d 100755 --- a/tests/mode_test.py +++ b/tests/mode_test.py @@ -30,7 +30,7 @@ def check(raw_config, fps): set_format = SensorFormat(camera_config["raw"]["format"]) requested_format = SensorFormat(raw_config["format"]) # For now, assume all our cameras are rotated 180 degrees. - rotation = 180 # picam2.camera_properties["Rotation"] + rotation = picam2.camera_properties["Rotation"] set_format.transform(Transform(rotation=rotation)) # Bayer order should match, as should bit depth (taking it from the sensor config # if present). 
Insist that there either is packing of some form on both, or none. diff --git a/tests/quality_check.py b/tests/quality_check.py old mode 100644 new mode 100755 index e368c16d..9b2f664b --- a/tests/quality_check.py +++ b/tests/quality_check.py @@ -1,7 +1,12 @@ +#!/usr/bin/python3 + import io +import os import time -from picamera2 import Picamera2 +import cv2 + +from picamera2 import MappedArray, Picamera2 from picamera2.encoders import H264Encoder, JpegEncoder, MJPEGEncoder, Quality from picamera2.outputs import FileOutput @@ -10,7 +15,38 @@ # between very low/high is so great that the behaviour should be reliable enough. +# If ASSET_DIR is set, we can inject this file instead of using camera images. +FILENAME = "quality_check.mp4" +FRAMES = [] +COUNTER = 0 +SIZE = (640, 360) +if os.environ.get('ASSET_DIR') is not None: + filename = os.path.join(os.path.expanduser(os.environ['ASSET_DIR']), FILENAME) + if os.path.isfile(filename): + print("Using file", filename) + cap = cv2.VideoCapture(filename) + while len(FRAMES) < 100: + ret, frame = cap.read() + if not ret: + break + if frame.shape != (SIZE[1], SIZE[0], 3): + frame = cv2.resize(frame, dsize=SIZE) + FRAMES.append(frame) +else: + print("Using camera") + + +def callback(request): + if FRAMES: + global COUNTER + with MappedArray(request, 'main') as m: + m.array[...] = FRAMES[COUNTER] + COUNTER = (COUNTER + 1) % len(FRAMES) + + def do_encode(encoder, quality): + global COUNTER + COUNTER = 0 data = io.BytesIO() picam2.start_encoder(encoder, output=FileOutput(data), name='main', quality=quality) time.sleep(time_seconds) @@ -19,25 +55,26 @@ def do_encode(encoder, quality): picam2 = Picamera2() -config = picam2.create_video_configuration({'format': 'RGB888', 'size': (640, 360)}, lores={'size': (640, 360)}) +config = picam2.create_video_configuration({'format': 'RGB888', 'size': SIZE}, lores={'size': SIZE}) picam2.configure(config) picam2.start() +picam2.pre_callback = callback time_seconds = 5 low_quality = do_encode(MJPEGEncoder(), Quality.VERY_LOW) high_quality = do_encode(MJPEGEncoder(), Quality.VERY_HIGH) print("MJPEGEncoder: low quality", low_quality, "high quality", high_quality) -if (low_quality > high_quality): +if (1.5 * low_quality > high_quality): print("Error: MJPEGEncoder file sizes not as expected") low_quality = do_encode(H264Encoder(), Quality.VERY_LOW) high_quality = do_encode(H264Encoder(), Quality.VERY_HIGH) print("H264Encoder: low quality", low_quality, "high quality", high_quality) -if (low_quality > high_quality): +if (1.5 * low_quality > high_quality): print("Error: H264Encoder file sizes not as expected") low_quality = do_encode(JpegEncoder(), Quality.VERY_LOW) high_quality = do_encode(JpegEncoder(), Quality.VERY_HIGH) print("JpegEncoder: low quality", low_quality, "high quality", high_quality) -if (low_quality > high_quality): +if (1.5 * low_quality > high_quality): print("Error: JpegEncoder file sizes not as expected") diff --git a/tests/stride_test.py b/tests/stride_test.py new file mode 100755 index 00000000..b1b7e4ea --- /dev/null +++ b/tests/stride_test.py @@ -0,0 +1,41 @@ +#!/usr/bin/python3 + +import time + +from picamera2 import MappedArray, Picamera2, libcamera + + +def pre_callback(request): + # Set the size, to make preview window and MappedArray remapping work + assert request.config["main"]["stride"] == stride + request.config["main"]["size"] = full_size + request.stream_map["main"].configuration.size = libcamera.Size(*full_size) + + +def post_callback(request): + # Make right side grey + with 
MappedArray(request, "main") as m1: + a1 = m1.array + a1[:, -a1.shape[1] // 2:] = 70 + + +picam2 = Picamera2(0) + +full_size = (1920, 1080) +half_size = (full_size[0] // 2, full_size[1]) +# Calculate stride for full frame +full_config = picam2.create_preview_configuration({"size": full_size}) +picam2.configure(full_config) +stride = picam2.camera_config["main"]["stride"] + +# Configure as half frame, with full frame stride so right side is blank +picam2.pre_callback = pre_callback +picam2.post_callback = post_callback +main_config = picam2.create_preview_configuration( + main={"size": half_size, "stride": stride} +) +picam2.configure(main_config) +picam2.start_preview(True) + +picam2.start() +time.sleep(2) diff --git a/tests/test_list.txt b/tests/test_list.txt index 37334543..b9a921b9 100644 --- a/tests/test_list.txt +++ b/tests/test_list.txt @@ -29,10 +29,14 @@ examples/opencv_mertens_merge.py examples/overlay_gl.py examples/overlay_null.py examples/overlay_qt.py +examples/picamera2_multiprocessing.py examples/pick_mode.py examples/preview.py examples/preview_x_forwarding.py +examples/pyav_capture.py +examples/pyav_circular_capture.py examples/raw.py +examples/request_context_manager.py examples/rotation.py examples/still_capture_with_config.py examples/still_during_video.py @@ -45,6 +49,7 @@ examples/tuning_file.py examples/video_with_config.py examples/window_offset.py examples/zoom.py +tests/alignment.py tests/app_dual.py tests/app_full_test.py tests/app_test.py @@ -59,12 +64,15 @@ tests/colour_spaces.py tests/config_with_sensor.py tests/configurations.py tests/context_test.py +tests/crop_test.py tests/display_transform_null.py tests/display_transform_qt.py tests/easy_video2.py tests/egl_leak.py tests/encoder_start_stop.py tests/ffmpeg_abort.py +tests/grey_world.py +tests/imx708_device.py tests/large_datagram.py tests/mjpeg_server.py tests/no_raw.py @@ -80,3 +88,5 @@ tests/qt_gl_preview_test.py tests/stop_slow_framerate.py tests/allocator_test.py tests/allocator_leak_test.py +tests/wait_cancel_test.py +tests/stride_test.py diff --git a/tests/wait_cancel_test.py b/tests/wait_cancel_test.py new file mode 100755 index 00000000..747c04df --- /dev/null +++ b/tests/wait_cancel_test.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 + +import time + +from picamera2 import CancelledError, Picamera2, TimeoutError + +# At 2 fps should take over 3s to see the first frame. +controls = {'FrameRate': 2} + +with Picamera2() as picam2: + config = picam2.create_preview_configuration(controls=controls) + picam2.start(config) + t0 = time.monotonic() + + # Test that we time out correctly, and that we can cancel everything so + # that we stop quickly. + try: + array = picam2.capture_array(wait=1.0) + except TimeoutError: + print("Timed out") + else: + print("ERROR: operation did not time out") + + t1 = time.monotonic() + if t1 - t0 > 2.0: + print("ERROR: time out appears to have taken too long") + + picam2.cancel_all_and_flush() + picam2.stop() + t2 = time.monotonic() + print("Stopping took", t2 - t1, "seconds") + if t2 - t1 > 0.1: + print(f"ERROR: stopping took too long ({t2-t1} seconds)") + +with Picamera2() as picam2: + config = picam2.create_preview_configuration(controls=controls) + picam2.start(config) + t0 = time.monotonic() + + # Test that we can cancel a job and get a correct CancelledError. 
+    job = picam2.capture_array(wait=False)
+    picam2.cancel_all_and_flush()
+
+    try:
+        array = job.get_result()
+    except CancelledError:
+        print("Job was cancelled")
+    else:
+        print("ERROR: job was not cancelled")
+
+    t1 = time.monotonic()
+    if t1 - t0 > 0.5:
+        print("ERROR: job took too long to cancel")
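
Editor's note: the hunks above introduce several user-facing APIs (the `captured_request()` context manager, timeout-aware captures via `wait=`, and `cancel_all_and_flush()`), but the diff only exercises them inside test scripts. The sketch below is an illustrative usage example rather than part of the change set; it assumes a camera is attached and simply mirrors the calls shown in `tests/wait_cancel_test.py` and the new `captured_request()` method.

```python
# Illustrative sketch only (not part of this diff): request context manager
# plus the timeout/cancel behaviour added in this release.

from picamera2 import CancelledError, Picamera2, TimeoutError

with Picamera2() as picam2:
    # A low frame rate makes the timeout below easy to trigger.
    picam2.start(picam2.create_preview_configuration(controls={'FrameRate': 2}))

    # captured_request() guarantees the request is released, even if an
    # exception is raised while we hold it.
    with picam2.captured_request() as request:
        print("ExposureTime:", request.get_metadata().get("ExposureTime"))

    # Blocking captures can now time out instead of waiting indefinitely.
    try:
        picam2.capture_array(wait=1.0)
    except TimeoutError:
        print("Capture timed out")

    # Asynchronous captures return a job; cancelling flushes it promptly.
    job = picam2.capture_array(wait=False)
    picam2.cancel_all_and_flush()
    try:
        job.get_result()
    except CancelledError:
        print("Job was cancelled")

    picam2.stop()
```

As in the test above, `cancel_all_and_flush()` lets a caller abandon outstanding jobs quickly instead of waiting for frames that may never arrive.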