Crop to the rowing machine screen with perspective correction - detection parameters can be tuned with optimize_crop.py and screen_classifier

2026-03-16 13:46:02 +00:00
parent 2e386a4297
commit f0184319c6
4 changed files with 309 additions and 31 deletions
--- a/crop_to_screen.py
+++ b/crop_to_screen.py
@@ -18,26 +18,39 @@ import glob
 import sys


+def order_corners(pts):
+    """Order 4 points as [top-left, top-right, bottom-right, bottom-left]."""
+    rect = np.zeros((4, 2), dtype="float32")
+    s = pts.sum(axis=1)
+    rect[0] = pts[np.argmin(s)]
+    rect[2] = pts[np.argmax(s)]
+    d = np.diff(pts, axis=1)
+    rect[1] = pts[np.argmin(d)]
+    rect[3] = pts[np.argmax(d)]
+    return rect
+
+
 def find_screen(image):
    """
    Detect the Concept 2 PM5 LCD screen region in the image.

-    Returns (x, y, w, h) bounding box or None if not found.
+    Returns (x, y, w, h, contour) or None if not found.
+    The contour is the best-matching contour for perspective correction.
    """
    h_img, w_img = image.shape[:2]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Pre-compute edge map for internal-content scoring
-    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
-    edges = cv2.Canny(blurred, 50, 150)
+    blurred = cv2.GaussianBlur(gray, (11, 11), 0)
+    edges = cv2.Canny(blurred, 80, 100)

    candidates = []

    # Sweep brightness thresholds — screen brightness varies by
    # lighting conditions (ranges from ~100 in dim gyms to ~200+)
-    for thresh_val in range(120, 200, 10):
+    for thresh_val in range(70, 210, 10):
        _, thresh = cv2.threshold(gray, thresh_val, 255, cv2.THRESH_BINARY)
-        kern = cv2.getStructuringElement(cv2.MORPH_RECT, (11, 11))
+        kern = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
        thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kern)
        thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kern)

@@ -54,39 +67,65 @@ def find_screen(image):

            # Size: screen is a small-to-medium portion of the photo
            area_ratio = rect_area / (h_img * w_img)
-            if area_ratio < 0.005 or area_ratio > 0.12:
+            if area_ratio < 0.004480508227271387 or area_ratio > 0.13807760800032298:
                continue

-            # Aspect ratio: LCD is roughly square (0.5 to 1.6)
+            # Aspect ratio: accept squarish to roughly 2:1 landscape regions
            aspect = w / h
-            if aspect < 0.5 or aspect > 1.6:
+            if aspect < 0.6831978184146027 or aspect > 1.9505294279578584:
                continue

            # Rectangularity
            rectangularity = area / rect_area
-            if rectangularity < 0.4:
+            if rectangularity < 0.6914579162415992:
                continue

-            # KEY: edge density — LCD with text > 0.03, plain surfaces < 0.01
+            # KEY: edge density — LCD with text has high edge density
            roi_edges = edges[y : y + h, x : x + w]
            edge_density = np.sum(roi_edges > 0) / rect_area
-            if edge_density < 0.03:
+            if edge_density < 0.012759310759672408:
                continue

            # Score: edge density * area * rectangularity
            # This favours text-rich regions that are large and well-shaped
            score = edge_density * area * rectangularity
-            candidates.append((score, x, y, w, h))
+            candidates.append((score, x, y, w, h, cnt))

    if not candidates:
        return None

    candidates.sort(key=lambda c: c[0], reverse=True)
-    return candidates[0][1:]
+    best = candidates[0]
+    return best[1], best[2], best[3], best[4], best[5]
+
+
+def perspective_correct(image, contour, dst_w, dst_h):
+    """Warp the screen quadrilateral to a flat rectangle."""
+    # Approximate contour to a polygon, loosening the tolerance until we get 4 corners
+    peri = cv2.arcLength(contour, True)
+    for eps_mult in [0.02, 0.03, 0.05, 0.08, 0.10]:
+        approx = cv2.approxPolyDP(contour, eps_mult * peri, True)
+        if len(approx) == 4:
+            break
+
+    if len(approx) != 4:
+        # Fall back to the minimum area rectangle corners
+        rect = cv2.minAreaRect(contour)
+        approx = cv2.boxPoints(rect).astype(np.float32)
+    else:
+        approx = approx.reshape(4, 2).astype(np.float32)
+
+    src = order_corners(approx)
+    dst = np.array(
+        [[0, 0], [dst_w - 1, 0], [dst_w - 1, dst_h - 1], [0, dst_h - 1]],
+        dtype="float32",
+    )
+    M = cv2.getPerspectiveTransform(src, dst)
+    return cv2.warpPerspective(image, M, (dst_w, dst_h))


 def crop_screen(image_path, output_path, padding=15):
-    """Load an image, find the screen, crop and save it."""
+    """Load an image, find the screen, perspective-correct and save it."""
    image = cv2.imread(image_path)
    if image is None:
        print(f"  ERROR: Could not read {image_path}")
@@ -99,16 +138,12 @@ def crop_screen(image_path, output_path, padding=15):
        print(f"  SKIP:  No screen detected in {os.path.basename(image_path)}")
        return False

-    x, y, w, h = result
+    x, y, w, h, contour = result

-    # Add padding, clamped to image bounds
-    x1 = max(0, x - padding)
-    y1 = max(0, y - padding)
-    x2 = min(w_img, x + w + padding)
-    y2 = min(h_img, y + h + padding)
+    # Flatten the screen via perspective correction (padding only enlarges the output size; the contour still maps to the output corners)
+    corrected = perspective_correct(image, contour, w + 2 * padding, h + 2 * padding)

-    cropped = image[y1:y2, x1:x2]
-    cv2.imwrite(output_path, cropped, [cv2.IMWRITE_JPEG_QUALITY, 95])
+    cv2.imwrite(output_path, corrected, [cv2.IMWRITE_JPEG_QUALITY, 95])
    print(
        f"  OK:    {os.path.basename(image_path)} -> {os.path.basename(output_path)}  ({w}x{h})"
    )
--- a/optimize_crop.py
+++ b/optimize_crop.py
@@ -0,0 +1,202 @@
+"""
+Optimize crop_to_screen.py parameters using Optuna.
+
+Uses the CNN classifier (cnn_predict) from screen_classifier.py as the
+evaluation function. For each trial, runs find_screen_parameterized() with
+suggested parameters on all source photos and counts how many crops are
+classified as rowing displays (label=1). Optuna maximises this count.
+
+Usage:
+    python optimize_crop.py [--n-trials 300] [--photos-dir photos/]
+"""
+
+import argparse
+import glob
+import os
+import tempfile
+
+import cv2
+import numpy as np
+import optuna
+
+from screen_classifier import cnn_predict
+
+
+def find_screen_parameterized(image, params):
+    """
+    Detect the Concept 2 PM5 LCD screen region in the image.
+    Same logic as crop_to_screen.find_screen but with tunable parameters.
+
+    Returns (x, y, w, h) bounding box or None if not found.
+    """
+    h_img, w_img = image.shape[:2]
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+    gk = params["gaussian_kernel_size"]
+    blurred = cv2.GaussianBlur(gray, (gk, gk), 0)
+    edges = cv2.Canny(blurred, params["canny_low"], params["canny_high"])
+
+    candidates = []
+
+    for thresh_val in range(params["thresh_min"], params["thresh_max"], 10):
+        _, thresh = cv2.threshold(gray, thresh_val, 255, cv2.THRESH_BINARY)
+        mk = params["morph_kernel_size"]
+        kern = cv2.getStructuringElement(cv2.MORPH_RECT, (mk, mk))
+        thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kern)
+        thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kern)
+
+        contours, _ = cv2.findContours(
+            thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+        )
+
+        for cnt in contours:
+            x, y, w, h = cv2.boundingRect(cnt)
+            area = cv2.contourArea(cnt)
+            rect_area = w * h
+            if rect_area == 0:
+                continue
+
+            area_ratio = rect_area / (h_img * w_img)
+            if (
+                area_ratio < params["area_ratio_min"]
+                or area_ratio > params["area_ratio_max"]
+            ):
+                continue
+
+            aspect = w / h
+            if aspect < params["aspect_min"] or aspect > params["aspect_max"]:
+                continue
+
+            rectangularity = area / rect_area
+            if rectangularity < params["rectangularity_min"]:
+                continue
+
+            roi_edges = edges[y : y + h, x : x + w]
+            edge_density = np.sum(roi_edges > 0) / rect_area
+            if edge_density < params["edge_density_min"]:
+                continue
+
+            score = edge_density * area * rectangularity
+            candidates.append((score, x, y, w, h))
+
+    if not candidates:
+        return None
+
+    candidates.sort(key=lambda c: c[0], reverse=True)
+    return candidates[0][1:]
+
+
+def load_images(photos_dir):
+    """Load all source images once for reuse across trials."""
+    paths = sorted(
+        glob.glob(os.path.join(photos_dir, "*.JPEG"))
+        + glob.glob(os.path.join(photos_dir, "*.jpeg"))
+        + glob.glob(os.path.join(photos_dir, "*.jpg"))
+        + glob.glob(os.path.join(photos_dir, "*.JPG"))
+    )
+    images = []
+    for p in paths:
+        img = cv2.imread(p)
+        if img is not None:
+            images.append((p, img))
+    return images
+
+
+def make_objective(images, tmp_dir, model_path):
+    """Create an Optuna objective function closed over images, tmp_dir and model_path."""
+
+    def objective(trial):
+        params = {
+            "thresh_min": trial.suggest_int("thresh_min", 60, 160, step=10),
+            "thresh_max": trial.suggest_int("thresh_max", 160, 255, step=10),
+            "morph_kernel_size": trial.suggest_int("morph_kernel_size", 3, 21, step=2),
+            "gaussian_kernel_size": trial.suggest_int(
+                "gaussian_kernel_size", 3, 11, step=2
+            ),
+            "canny_low": trial.suggest_int("canny_low", 20, 100, step=10),
+            "canny_high": trial.suggest_int("canny_high", 100, 250, step=10),
+            "area_ratio_min": trial.suggest_float("area_ratio_min", 0.001, 0.02),
+            "area_ratio_max": trial.suggest_float("area_ratio_max", 0.05, 0.30),
+            "aspect_min": trial.suggest_float("aspect_min", 0.3, 0.8),
+            "aspect_max": trial.suggest_float("aspect_max", 1.2, 2.5),
+            "rectangularity_min": trial.suggest_float("rectangularity_min", 0.2, 0.7),
+            "edge_density_min": trial.suggest_float("edge_density_min", 0.005, 0.06),
+        }
+
+        # Ensure thresh_min < thresh_max
+        if params["thresh_min"] >= params["thresh_max"]:
+            return 0
+
+        # Ensure canny_low < canny_high
+        if params["canny_low"] >= params["canny_high"]:
+            return 0
+
+        rowing_count = 0
+        for img_path, img in images:
+            result = find_screen_parameterized(img, params)
+            if result is None:
+                continue
+
+            x, y, w, h = result
+            h_img, w_img = img.shape[:2]
+            padding = 15
+            x1 = max(0, x - padding)
+            y1 = max(0, y - padding)
+            x2 = min(w_img, x + w + padding)
+            y2 = min(h_img, y + h + padding)
+            cropped = img[y1:y2, x1:x2]
+
+            # Save to temp file for the classifier
+            basename = os.path.splitext(os.path.basename(img_path))[0]
+            tmp_path = os.path.join(tmp_dir, f"{basename}_trial{trial.number}.jpg")
+            cv2.imwrite(tmp_path, cropped, [cv2.IMWRITE_JPEG_QUALITY, 95])
+
+            try:
+                label, _ = cnn_predict(tmp_path, model_path)
+                if label == 1:
+                    rowing_count += 1
+            finally:
+                # Clean up immediately to save disk space
+                if os.path.exists(tmp_path):
+                    os.remove(tmp_path)
+
+        return rowing_count
+
+    return objective
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Optimize crop_to_screen.py parameters"
+    )
+    parser.add_argument(
+        "--photos-dir", default="photos/", help="Directory of source photos"
+    )
+    parser.add_argument(
+        "--n-trials", type=int, default=300, help="Number of Optuna trials"
+    )
+    parser.add_argument(
+        "--model-path",
+        default="screen_classifier_model.pth",
+        help="Path to CNN model weights",
+    )
+    args = parser.parse_args()
+
+    images = load_images(args.photos_dir)
+    print(f"Loaded {len(images)} source images from {args.photos_dir}")
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        study = optuna.create_study(direction="maximize")
+        objective = make_objective(images, tmp_dir, args.model_path)
+        study.optimize(objective, n_trials=args.n_trials, show_progress_bar=True)
+
+    print(f"\n{'=' * 60}")
+    print(f"Best score: {study.best_value} / {len(images)} images classified as rowing")
+    print(f"Best parameters:")
+    for k, v in sorted(study.best_params.items()):
+        print(f"  {k:>25s}: {v}")
+    print(f"{'=' * 60}")
+
+
+if __name__ == "__main__":
+    main()
--- a/screen_classifier.py
+++ b/screen_classifier.py
@@ -195,7 +195,10 @@ def get_cnn_model():


 def train_cnn(
-    data_dir: str, epochs: int = 20, lr: float = 1e-3, save_path: str = "model.pth"
+    data_dir: str,
+    epochs: int = 20,
+    lr: float = 1e-3,
+    save_path: str = "screen_classifier_model.pth",
 ):
    """
    Train the CNN. Expects data_dir with structure:
@@ -336,8 +339,9 @@ def main():
    sub = parser.add_subparsers(dest="command", required=True)

    # --- predict ---
-    p_pred = sub.add_parser("predict", help="Classify an image")
-    p_pred.add_argument("--image", required=True, help="Path to image file")
+    p_pred = sub.add_parser("predict", help="Classify an image or directory of images")
+    p_pred.add_argument("--image", help="Path to image file")
+    p_pred.add_argument("--dir", help="Path to directory of images")
    p_pred.add_argument(
        "--model",
        choices=["features", "cnn"],
@@ -361,13 +365,50 @@ def main():
    args = parser.parse_args()

    if args.command == "predict":
-        if args.model == "features":
-            label, conf = feature_based_predict(args.image, verbose=args.verbose)
-        else:
-            label, conf = cnn_predict(args.image, args.model_path)
+        if not args.image and not args.dir:
+            parser.error("predict requires --image or --dir")
+        if args.image and args.dir:
+            parser.error("--image and --dir are mutually exclusive")

-        tag = "ROWING MACHINE" if label == 1 else "NOT ROWING MACHINE"
-        print(f"\n  Result: {tag}  (label={label}, confidence={conf:.2f})\n")
+        # Build list of image paths
+        IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff"}
+        if args.dir:
+            dir_path = Path(args.dir)
+            if not dir_path.is_dir():
+                print(f"Error: {args.dir} is not a directory")
+                sys.exit(1)
+            image_paths = sorted(
+                p for p in dir_path.iterdir() if p.suffix.lower() in IMAGE_EXTS
+            )
+            if not image_paths:
+                print(f"No images found in {args.dir}")
+                sys.exit(1)
+        else:
+            image_paths = [Path(args.image)]
+
+        # Classify each image
+        rowing_count = 0
+        for img_path in image_paths:
+            if args.model == "features":
+                label, conf = feature_based_predict(str(img_path), verbose=args.verbose)
+            else:
+                label, conf = cnn_predict(str(img_path), args.model_path)
+
+            tag = "ROWING MACHINE" if label == 1 else "NOT ROWING MACHINE"
+            if args.dir:
+                print(f"  {img_path.name}  \u2192  {tag}  (confidence={conf:.2f})")
+            else:
+                print(f"\n  Result: {tag}  (label={label}, confidence={conf:.2f})\n")
+            if label == 1:
+                rowing_count += 1
+
+        # Summary for directory mode
+        if args.dir:
+            total = len(image_paths)
+            not_rowing = total - rowing_count
+            print(
+                f"\n  Summary: {total} images | {rowing_count} rowing | {not_rowing} not rowing"
+            )

    elif args.command == "train":
        train_cnn(args.data_dir, epochs=args.epochs, lr=args.lr, save_path=args.save)
--- a/screen_classifier_model.pth
+++ b/screen_classifier_model.pth