Hey CanMV Community,
I’m new to hardware programming and currently working on face detection using the K230 (hardware v1.1) with CanMV Micropython. Below are the functions I’ve added for post-processing to improve the shape of detected faces. However, I’m facing performance issues when using them.
Steps to Reproduce
- Set up a face detection pipeline on the K230 with CanMV Micropython, using a YuNet model for detection.
- Add the following functions (see code below) to post-process and align detected faces based on eye landmarks.
- Run the pipeline with
rgb888p_size = [1920, 1080]
and display_size = [1920, 1080]
on an HDMI display, processing live video frame by frame.
- Observe the system behavior during continuous face detection.
Expected Results and Actual Results
- Expected Results: Smooth live face detection at a reasonable frame rate (e.g., 10-15 FPS) with properly aligned faces for further processing (e.g., recognition).
- Actual Results: When using these alignment functions, the K230 lags significantly, often freezing or stuttering during live detection. Without these functions, detection runs smoother (around 10-12 FPS), with occasional minor stutters (once per hour), but it’s much less laggy compared to when the alignment code is included.
Software and Hardware Version Information
- Hardware: K230 development board, version 1.1.
- Software: CanMV Micropython (latest version as of March 17, 2025), YuNet model (
k230_face_detection_yunet.kmodel
).
Error Log
- No specific error messages appear in the console.
- The system slows down or freezes intermittently when the alignment functions are active, especially during live frame processing. Timing logs (e.g., via
ScopedTiming
) aren’t captured yet, but the lag is visually noticeable.
Solution Attempts
- Removed the alignment code: Detection runs smoothly without
align_img_wrt_eyes
and related functions, though minor stutters still occur occasionally (once per hour).
- Adjusted resolution: Tested with
rgb888p_size = [640, 320]
, which reduces lag but compromises detection accuracy, so I reverted to 1920x1080.
- Considered optimization: Thought about simplifying the
M = np.array
transformation or offloading it to hardware, but I’m unsure how to proceed with the K230’s capabilities.
Supplementary Materials
Here are the functions I’m using to post-process and align faces after detection:
import ulab.numpy as np
def align_img_wrt_eyes(img, left_eye, right_eye):
    """Rotate ``img`` about its centre so the eye line becomes horizontal.

    Nearest-neighbour inverse mapping (dest pixel -> source pixel).
    Supports CHW ``(3, H, W)``, HWC ``(H, W, 3)`` and grayscale
    ``(H, W)`` uint8 arrays; the output has the same layout.

    :param img: input image array (uint8).
    :param left_eye: (x, y) of the left eye, or None/empty to skip.
    :param right_eye: (x, y) of the right eye, or None/empty to skip.
    :returns: ``(aligned_img, angle_degrees)``; ``(img, 0)`` unchanged
        when either eye landmark is missing.
    :raises ValueError: for any other array shape.
    """
    if not left_eye or not right_eye:
        return img, 0

    # Angle of the eye line relative to the horizontal axis.
    angle = float(np.degrees(np.arctan2(left_eye[1] - right_eye[1],
                                        left_eye[0] - right_eye[0])))

    is_chw = len(img.shape) == 3 and img.shape[0] == 3
    is_hwc = len(img.shape) == 3 and img.shape[2] == 3
    if is_chw:
        channels, h, w = img.shape
    elif is_hwc:
        h, w, channels = img.shape
    elif len(img.shape) == 2:
        h, w = img.shape
        channels = 1
    else:
        raise ValueError("Unsupported image shape: {}".format(img.shape))

    cx = w // 2
    cy = h // 2
    cos_val = float(np.cos(np.radians(angle)))
    sin_val = float(np.sin(np.radians(angle)))

    # 2x3 inverse-rotation coefficients kept as plain Python floats.
    # Indexing a 2x3 ndarray per pixel (M[0, 0] etc.) is a major cost on
    # MicroPython; scalar locals are LOAD_FAST and avoid the allocation.
    m00, m01, m02 = cos_val, sin_val, (1 - cos_val) * cx - sin_val * cy
    m10, m11, m12 = -sin_val, cos_val, sin_val * cx + (1 - cos_val) * cy

    if is_chw:
        aligned_img = np.zeros((channels, h, w), dtype=np.uint8)
        for i in range(h):
            # Hoist the row-dependent part of the affine transform.
            row_x = m01 * i + m02
            row_y = m11 * i + m12
            for j in range(w):
                src_x = int(m00 * j + row_x)
                src_y = int(m10 * j + row_y)
                if 0 <= src_x < w and 0 <= src_y < h:
                    for k in range(channels):
                        aligned_img[k, i, j] = img[k, src_y, src_x]
    elif is_hwc:
        aligned_img = np.zeros((h, w, channels), dtype=np.uint8)
        for i in range(h):
            row_x = m01 * i + m02
            row_y = m11 * i + m12
            for j in range(w):
                src_x = int(m00 * j + row_x)
                src_y = int(m10 * j + row_y)
                if 0 <= src_x < w and 0 <= src_y < h:
                    for k in range(channels):
                        aligned_img[i, j, k] = img[src_y, src_x, k]
    else:
        aligned_img = np.zeros((h, w), dtype=np.uint8)
        for i in range(h):
            row_x = m01 * i + m02
            row_y = m11 * i + m12
            for j in range(w):
                src_x = int(m00 * j + row_x)
                src_y = int(m10 * j + row_y)
                if 0 <= src_x < w and 0 <= src_y < h:
                    aligned_img[i, j] = img[src_y, src_x]

    return aligned_img, angle
def project_facial_area(facial_area, angle, size):
    """Project a bounding box through the rotation used for alignment.

    Rotates the two opposite corners of ``facial_area`` (x1, y1, x2, y2)
    by ``angle`` degrees about the centre of an image of ``size``
    (h, w), then returns the axis-aligned box that encloses them as
    (min_x, min_y, max_x, max_y).
    """
    left, top, right, bottom = facial_area
    height, width = size
    pivot_x = width // 2
    pivot_y = height // 2
    c = np.cos(np.radians(angle))
    s = np.sin(np.radians(angle))

    xs = []
    ys = []
    for px, py in ((left, top), (right, bottom)):
        # Translate to the pivot, rotate, translate back.
        dx = px - pivot_x
        dy = py - pivot_y
        xs.append(dx * c + dy * s + pivot_x)
        ys.append(dy * c - dx * s + pivot_y)

    return min(xs), min(ys), max(xs), max(ys)
def expand_and_align_face(img, x, y, w, h, landmarks, align=True, expand_percentage=0):
    """Crop a detected face and optionally rotate it so the eyes are level.

    :param img: source frame; sliced as ``img[y:y+h, x:x+w]``.
    :param x, y, w, h: detection box in pixel coordinates.
    :param landmarks: flat list ``[lx, ly, rx, ry, nose_x, nose_y,
        mouth_lx, mouth_ly, mouth_rx, mouth_ry]``; shorter lists yield
        ``None`` for the missing points.
    :param align: rotate the crop when both eye landmarks are present.
    :param expand_percentage: grow the box by this percent before cropping.
    :returns: ``(facial_area_dict, cropped_face)``; ``cropped_face`` is
        ``None`` when the (expanded) box collapses to zero size.
    """
    left_eye = (landmarks[0], landmarks[1]) if len(landmarks) >= 2 else None
    right_eye = (landmarks[2], landmarks[3]) if len(landmarks) >= 4 else None

    if expand_percentage > 0:
        # Grow the box symmetrically, then clamp it to the image bounds.
        # NOTE(review): assumes img is laid out rows-first (H, W[, C]);
        # an NCHW tensor would need different slicing — confirm callers.
        expanded_w = w + int(w * expand_percentage / 100)
        expanded_h = h + int(h * expand_percentage / 100)
        x = max(0, x - int((expanded_w - w) / 2))
        y = max(0, y - int((expanded_h - h) / 2))
        w = min(img.shape[1] - x, expanded_w)
        h = min(img.shape[0] - y, expanded_h)

    # Degenerate box: report an empty area instead of slicing.
    if w <= 0 or h <= 0:
        return {"x": x, "y": y, "w": 0, "h": 0, "left_eye": None, "right_eye": None,
                "nose": None, "mouth_left": None, "mouth_right": None}, None

    detected_face = img[y:y+h, x:x+w]

    if align and left_eye and right_eye:
        aligned_face, angle = align_img_wrt_eyes(detected_face, left_eye, right_eye)
        rx1, ry1, rx2, ry2 = project_facial_area((0, 0, w, h), angle, (h, w))
        # Clamp before slicing: a rotated corner can land outside the
        # crop, and a negative slice index would silently wrap around to
        # the opposite edge, producing a corrupt face crop.
        cx1 = max(0, min(w, int(rx1)))
        cy1 = max(0, min(h, int(ry1)))
        cx2 = max(0, min(w, int(rx2)))
        cy2 = max(0, min(h, int(ry2)))
        cropped_face = aligned_face[cy1:cy2, cx1:cx2]
    else:
        cropped_face = detected_face

    facial_area = {
        "x": x, "y": y, "w": w, "h": h,
        "left_eye": left_eye, "right_eye": right_eye,
        "nose": (landmarks[4], landmarks[5]) if len(landmarks) >= 6 else None,
        "mouth_left": (landmarks[6], landmarks[7]) if len(landmarks) >= 8 else None,
        "mouth_right": (landmarks[8], landmarks[9]) if len(landmarks) >= 10 else None
    }
    return facial_area, cropped_face