rnd-v1

2026-05-03 21:58:47 +03:00
parent fd44235ff4
commit e45d1cb6b4
18 changed files with 1259 additions and 0 deletions
--- a/bgtopo_poc/init.py
+++ b/bgtopo_poc/init.py
@@ -0,0 +1 @@
+__version__ = "0.1.0"
--- a/bgtopo_poc/cli.py
+++ b/bgtopo_poc/cli.py
@@ -0,0 +1,178 @@
+from __future__ import annotations
+
+import argparse
+import logging
+from pathlib import Path
+
+from .coordinates import extract_coordinate_crops, score_coordinates_for_sheet
+from .detector_cv import detect_sheet, draw_overlay
+from .export_yolo import export_candidates_to_yolo
+from .inventory import discover_original_assets, download_assets, read_manifest_csv, write_manifest_csv
+from .report import build_report
+from .train_yolo import train_yolo
+from .utils import load_yaml, setup_logging
+
+LOG = logging.getLogger(__name__)
+
+
+def cmd_inventory(args):
+    cfg = load_yaml(args.config)
+    base_url = args.base_url or cfg["source"]["base_url"]
+    assets = discover_original_assets(base_url=base_url, include_100k=args.include_100k)
+    if args.limit:
+        assets = assets[: args.limit]
+    write_manifest_csv(assets, args.out)
+
+
+def cmd_download(args):
+    assets = read_manifest_csv(args.manifest)
+    selected = download_assets(assets, args.out_dir, limit=args.limit, overwrite=args.overwrite)
+    write_manifest_csv(selected, args.out_manifest)
+
+
+def cmd_detect(args):
+    cfg = load_yaml(args.config)
+    detect_sheet(args.map, args.tif, args.sheet_id, cfg, args.out_dir)
+
+
+def cmd_overlay(args):
+    draw_overlay(args.tif, args.candidates, args.out)
+
+
+def cmd_score_coords(args):
+    cfg = load_yaml(args.config)
+    score_coordinates_for_sheet(
+        coord_csv=args.coordinates,
+        candidates_csv=args.candidates,
+        map_path=args.map,
+        tif_path=args.tif,
+        sheet_id=args.sheet_id,
+        cfg=cfg,
+        out_dir=args.out_dir,
+        coord_crs=args.coord_crs,
+    )
+
+
+def cmd_crops(args):
+    extract_coordinate_crops(
+        coord_scores_csv=args.scores,
+        map_path=args.map,
+        tif_path=args.tif,
+        out_dir=args.out_dir,
+        crop_size=args.crop_size,
+    )
+
+
+def cmd_export_yolo(args):
+    cfg = load_yaml(args.config)
+    export_candidates_to_yolo(
+        tif_path=args.tif,
+        candidates_csv=args.candidates,
+        out_dir=args.out_dir,
+        cfg=cfg,
+        sheet_id=args.sheet_id,
+        tile_size=args.tile_size,
+        overlap=args.overlap,
+        val_fraction=args.val_fraction,
+    )
+
+
+def cmd_train_yolo(args):
+    train_yolo(args.data_yaml, model=args.model, imgsz=args.imgsz, epochs=args.epochs, batch=args.batch, device=args.device)
+
+
+def cmd_report(args):
+    build_report([Path(p) for p in args.candidates], [Path(p) for p in args.overlays], args.out)
+
+
+def build_parser():
+    p = argparse.ArgumentParser(prog="bgtopo-bluebox", description="BGtopoVJ blue rectangle/square PoC pipeline")
+    p.add_argument("--verbose", action="store_true")
+    sub = p.add_subparsers(dest="cmd", required=True)
+
+    s = sub.add_parser("inventory", help="Crawl original raster directory and create manifest")
+    s.add_argument("--config", default="configs/blue_detector.yaml")
+    s.add_argument("--base-url", default=None)
+    s.add_argument("--out", default="data/manifest.csv")
+    s.add_argument("--limit", type=int, default=None)
+    s.add_argument("--include-100k", action="store_true")
+    s.set_defaults(func=cmd_inventory)
+
+    s = sub.add_parser("download", help="Download .map/.tif pairs from manifest")
+    s.add_argument("--manifest", default="data/manifest.csv")
+    s.add_argument("--out-dir", default="data/raw")
+    s.add_argument("--out-manifest", default="data/manifest_downloaded.csv")
+    s.add_argument("--limit", type=int, default=2)
+    s.add_argument("--overwrite", action="store_true")
+    s.set_defaults(func=cmd_download)
+
+    s = sub.add_parser("detect", help="Detect blue rectangle/square candidates on one sheet")
+    s.add_argument("--config", default="configs/blue_detector.yaml")
+    s.add_argument("--sheet-id", required=True)
+    s.add_argument("--map", default=None)
+    s.add_argument("--tif", required=True)
+    s.add_argument("--out-dir", default="data/interim/candidates")
+    s.set_defaults(func=cmd_detect)
+
+    s = sub.add_parser("overlay", help="Draw candidate overlay for manual QA")
+    s.add_argument("--tif", required=True)
+    s.add_argument("--candidates", required=True)
+    s.add_argument("--out", required=True)
+    s.set_defaults(func=cmd_overlay)
+
+    s = sub.add_parser("score-coords", help="Score coordinate CSV against candidates from one sheet")
+    s.add_argument("--config", default="configs/blue_detector.yaml")
+    s.add_argument("--sheet-id", required=True)
+    s.add_argument("--coordinates", required=True)
+    s.add_argument("--candidates", required=True)
+    s.add_argument("--map", default=None)
+    s.add_argument("--tif", required=True)
+    s.add_argument("--out-dir", default="data/interim/coordinate_scores")
+    s.add_argument("--coord-crs", default="EPSG:4326")
+    s.set_defaults(func=cmd_score_coords)
+
+    s = sub.add_parser("crops", help="Extract review crops around scored coordinates")
+    s.add_argument("--scores", required=True)
+    s.add_argument("--map", default=None)
+    s.add_argument("--tif", required=True)
+    s.add_argument("--out-dir", default="data/interim/crops")
+    s.add_argument("--crop-size", type=int, default=256)
+    s.set_defaults(func=cmd_crops)
+
+    s = sub.add_parser("export-yolo", help="Export weak candidates to YOLO dataset format")
+    s.add_argument("--config", default="configs/blue_detector.yaml")
+    s.add_argument("--sheet-id", required=True)
+    s.add_argument("--tif", required=True)
+    s.add_argument("--candidates", required=True)
+    s.add_argument("--out-dir", default="data/yolo/bluebox")
+    s.add_argument("--tile-size", type=int, default=1024)
+    s.add_argument("--overlap", type=int, default=128)
+    s.add_argument("--val-fraction", type=float, default=0.20)
+    s.set_defaults(func=cmd_export_yolo)
+
+    s = sub.add_parser("train-yolo", help="Train YOLO on exported dataset")
+    s.add_argument("--data-yaml", required=True)
+    s.add_argument("--model", default="yolov8s.pt")
+    s.add_argument("--imgsz", type=int, default=1024)
+    s.add_argument("--epochs", type=int, default=80)
+    s.add_argument("--batch", type=int, default=4)
+    s.add_argument("--device", default="0")
+    s.set_defaults(func=cmd_train_yolo)
+
+    s = sub.add_parser("report", help="Build HTML QA report")
+    s.add_argument("--candidates", nargs="+", required=True)
+    s.add_argument("--overlays", nargs="*", default=[])
+    s.add_argument("--out", default="reports/poc_report.html")
+    s.set_defaults(func=cmd_report)
+    return p
+
+
+def main():
+    parser = build_parser()
+    args = parser.parse_args()
+    setup_logging(args.verbose)
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
--- a/bgtopo_poc/coordinates.py
+++ b/bgtopo_poc/coordinates.py
@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import numpy as np
+import pandas as pd
+from PIL import Image, ImageDraw
+from pyproj import Transformer
+from rasterio.windows import Window
+
+from .georef import open_georaster, read_window_rgb
+from .utils import ensure_dir
+
+LOG = logging.getLogger(__name__)
+
+
+def _normalize_coordinate_columns(df: pd.DataFrame) -> pd.DataFrame:
+    cols = {c.lower().strip(): c for c in df.columns}
+    lat_col = cols.get("lat") or cols.get("latitude") or cols.get("y")
+    lon_col = cols.get("lon") or cols.get("lng") or cols.get("longitude") or cols.get("x")
+    if not lat_col or not lon_col:
+        raise ValueError("Coordinate CSV needs lat/lon columns, or latitude/longitude, or y/x.")
+    out = df.copy()
+    out["lat"] = pd.to_numeric(out[lat_col], errors="coerce")
+    out["lon"] = pd.to_numeric(out[lon_col], errors="coerce")
+    if "id" not in out.columns:
+        out["id"] = [f"pt_{i:06d}" for i in range(len(out))]
+    return out.dropna(subset=["lat", "lon"])
+
+
+def load_coordinates(path: str | Path) -> pd.DataFrame:
+    return _normalize_coordinate_columns(pd.read_csv(path))
+
+
+def coord_to_rowcol(ds, lon: float, lat: float, coord_crs: str = "EPSG:4326") -> Optional[tuple[int, int]]:
+    if ds.crs is None:
+        return None
+    try:
+        transformer = Transformer.from_crs(coord_crs, ds.crs, always_xy=True)
+        x, y = transformer.transform(lon, lat)
+        row, col = ds.index(x, y)
+        return int(row), int(col)
+    except Exception as e:  # noqa: BLE001
+        LOG.debug("coord_to_rowcol failed: %s", e)
+        return None
+
+
+def score_coordinates_for_sheet(
+    coord_csv: str | Path,
+    candidates_csv: str | Path,
+    map_path: str | None,
+    tif_path: str | None,
+    sheet_id: str,
+    cfg: Dict,
+    out_dir: str | Path,
+    coord_crs: str = "EPSG:4326",
+) -> Path:
+    out_dir = ensure_dir(out_dir)
+    coords = load_coordinates(coord_csv)
+    cands = pd.read_csv(candidates_csv)
+    rh = open_georaster(map_path=map_path, tif_path=tif_path)
+    radius = float(cfg["coordinate_scoring"].get("search_radius_px", 45))
+    try:
+        rows: List[dict] = []
+        for _, pt in coords.iterrows():
+            rc = coord_to_rowcol(rh.dataset, float(pt.lon), float(pt.lat), coord_crs=coord_crs)
+            if rc is None:
+                continue
+            row, col = rc
+            if row < 0 or col < 0 or row >= rh.height or col >= rh.width:
+                continue
+            if cands.empty:
+                nearest = None
+            else:
+                dx = cands["cx"].astype(float).to_numpy() - col
+                dy = cands["cy"].astype(float).to_numpy() - row
+                dist = np.sqrt(dx * dx + dy * dy)
+                i = int(np.argmin(dist))
+                nearest = (i, float(dist[i]))
+            score = 0.0
+            nearest_id = None
+            nearest_dist = None
+            nearest_style = None
+            nearest_det_score = None
+            decision = "auto_negative"
+            if nearest:
+                i, d = nearest
+                nearest_dist = d
+                if d <= radius:
+                    nearest_id = i
+                    nearest_style = str(cands.iloc[i].get("fill_style", "unknown"))
+                    nearest_det_score = float(cands.iloc[i].get("score", 0.0))
+                    dist_factor = max(0.0, 1.0 - d / radius)
+                    score = float(0.55 * nearest_det_score + 0.45 * dist_factor)
+            if score >= float(cfg["coordinate_scoring"].get("strong_score", 0.90)):
+                decision = "auto_positive"
+            elif score >= float(cfg["coordinate_scoring"].get("weak_score", 0.40)):
+                decision = "review"
+            rows.append({
+                "id": pt.id,
+                "sheet_id": sheet_id,
+                "lat": float(pt.lat),
+                "lon": float(pt.lon),
+                "row": row,
+                "col": col,
+                "nearest_candidate_index": nearest_id,
+                "nearest_candidate_distance_px": nearest_dist,
+                "nearest_candidate_style": nearest_style,
+                "nearest_candidate_score": nearest_det_score,
+                "coordinate_score": score,
+                "decision": decision,
+            })
+        out_csv = Path(out_dir) / f"{sheet_id}_coordinate_scores.csv"
+        pd.DataFrame(rows).to_csv(out_csv, index=False)
+        LOG.info("Wrote coordinate scores: %s", out_csv)
+        return out_csv
+    finally:
+        rh.close()
+
+
+def extract_coordinate_crops(
+    coord_scores_csv: str | Path,
+    map_path: str | None,
+    tif_path: str | None,
+    out_dir: str | Path,
+    crop_size: int = 256,
+    only_decisions: tuple[str, ...] = ("review", "auto_positive"),
+) -> Path:
+    out_dir = ensure_dir(out_dir)
+    df = pd.read_csv(coord_scores_csv)
+    rh = open_georaster(map_path=map_path, tif_path=tif_path)
+    half = crop_size // 2
+    try:
+        for _, r in df.iterrows():
+            if str(r.decision) not in only_decisions:
+                continue
+            row, col = int(r.row), int(r.col)
+            win = Window(col - half, row - half, crop_size, crop_size)
+            rgb = read_window_rgb(rh.dataset, win)
+            img = Image.fromarray(rgb).convert("RGB")
+            draw = ImageDraw.Draw(img)
+            draw.ellipse([half - 5, half - 5, half + 5, half + 5], outline=(255, 0, 0), width=2)
+            name = f"{str(r.id)}__{str(r.decision)}__score_{float(r.coordinate_score):.3f}.png"
+            img.save(Path(out_dir) / name)
+        LOG.info("Wrote crops into: %s", out_dir)
+        return Path(out_dir)
+    finally:
+        rh.close()
--- a/bgtopo_poc/detector_cv.py
+++ b/bgtopo_poc/detector_cv.py
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+import cv2
+import numpy as np
+import pandas as pd
+from PIL import Image, ImageDraw
+from rasterio.windows import Window
+from tqdm import tqdm
+
+from .georef import iter_windows, open_georaster, pixel_to_lonlat, read_window_rgb
+from .utils import ensure_dir, safe_stem
+
+LOG = logging.getLogger(__name__)
+
+
+@dataclass
+class Candidate:
+    sheet_id: str
+    source_path: str
+    x: int
+    y: int
+    w: int
+    h: int
+    cx: float
+    cy: float
+    area: float
+    aspect: float
+    blue_fill_ratio: float
+    rectangularity: float
+    solidity: float
+    approx_vertices: int
+    fill_style: str
+    score: float
+    lon: float | None = None
+    lat: float | None = None
+
+    def to_dict(self):
+        return asdict(self)
+
+
+def build_blue_mask(rgb: np.ndarray, cfg: Dict) -> np.ndarray:
+    hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV)
+    mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
+    for r in cfg["detector"].get("hsv_ranges", []):
+        lower = np.array(r["lower"], dtype=np.uint8)
+        upper = np.array(r["upper"], dtype=np.uint8)
+        mask |= cv2.inRange(hsv, lower, upper)
+    morph = cfg["detector"].get("morphology", {})
+    open_k = int(morph.get("open_kernel", 0) or 0)
+    close_k = int(morph.get("close_kernel", 0) or 0)
+    if open_k > 1:
+        k = np.ones((open_k, open_k), np.uint8)
+        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, k)
+    if close_k > 1:
+        k = np.ones((close_k, close_k), np.uint8)
+        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, k)
+    dilate_iter = int(morph.get("dilate_iterations", 0) or 0)
+    if dilate_iter > 0:
+        mask = cv2.dilate(mask, np.ones((2, 2), np.uint8), iterations=dilate_iter)
+    return mask
+
+
+def _classify_fill_style(blue_fill_ratio: float, rectangularity: float, solidity: float) -> str:
+    if blue_fill_ratio >= 0.48 and solidity >= 0.60:
+        return "filled"
+    if 0.10 <= blue_fill_ratio < 0.42 and rectangularity >= 0.28:
+        return "hollow"
+    if blue_fill_ratio < 0.18 and rectangularity >= 0.18:
+        return "border"
+    return "unknown"
+
+
+def _score_candidate(blue_fill_ratio: float, rectangularity: float, solidity: float, aspect: float, approx_vertices: int) -> float:
+    aspect_bonus = 1.0 - min(abs(np.log(max(aspect, 1e-3))), 1.8) / 1.8
+    vertex_bonus = 1.0 if 4 <= approx_vertices <= 8 else 0.55
+    raw = 0.30 * blue_fill_ratio + 0.28 * rectangularity + 0.22 * solidity + 0.12 * aspect_bonus + 0.08 * vertex_bonus
+    return float(max(0.0, min(1.0, raw)))
+
+
+def find_candidates_in_rgb(rgb: np.ndarray, cfg: Dict, sheet_id: str, source_path: str, xoff: int = 0, yoff: int = 0) -> List[Candidate]:
+    det = cfg["detector"]
+    mask = build_blue_mask(rgb, cfg)
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    out: List[Candidate] = []
+
+    for contour in contours:
+        area = float(cv2.contourArea(contour))
+        if area < det["min_area_px"] or area > det["max_area_px"]:
+            continue
+        x, y, w, h = cv2.boundingRect(contour)
+        if w < det["min_width_px"] or h < det["min_height_px"]:
+            continue
+        if w > det["max_width_px"] or h > det["max_height_px"]:
+            continue
+        aspect = float(w / max(h, 1))
+        if not (det["min_aspect"] <= aspect <= det["max_aspect"]):
+            continue
+
+        bbox_mask = mask[y:y + h, x:x + w]
+        blue_fill_ratio = float(np.count_nonzero(bbox_mask) / max(w * h, 1))
+        if not (det["min_blue_fill_ratio"] <= blue_fill_ratio <= det["max_blue_fill_ratio"]):
+            continue
+
+        rectangularity = float(area / max(w * h, 1))
+        if rectangularity < det["min_rectangularity"]:
+            continue
+
+        hull = cv2.convexHull(contour)
+        hull_area = float(cv2.contourArea(hull))
+        solidity = float(area / hull_area) if hull_area > 0 else 0.0
+        if solidity < det["min_solidity"]:
+            continue
+
+        peri = float(cv2.arcLength(contour, True))
+        approx = cv2.approxPolyDP(contour, 0.04 * peri, True)
+        fill_style = _classify_fill_style(blue_fill_ratio, rectangularity, solidity)
+        score = _score_candidate(blue_fill_ratio, rectangularity, solidity, aspect, len(approx))
+        out.append(
+            Candidate(
+                sheet_id=sheet_id,
+                source_path=source_path,
+                x=int(x + xoff),
+                y=int(y + yoff),
+                w=int(w),
+                h=int(h),
+                cx=float(x + xoff + w / 2),
+                cy=float(y + yoff + h / 2),
+                area=area,
+                aspect=aspect,
+                blue_fill_ratio=blue_fill_ratio,
+                rectangularity=rectangularity,
+                solidity=solidity,
+                approx_vertices=int(len(approx)),
+                fill_style=fill_style,
+                score=score,
+            )
+        )
+    return out
+
+
+def bbox_iou(a: Candidate, b: Candidate) -> float:
+    ax1, ay1, ax2, ay2 = a.x, a.y, a.x + a.w, a.y + a.h
+    bx1, by1, bx2, by2 = b.x, b.y, b.x + b.w, b.y + b.h
+    ix1, iy1 = max(ax1, bx1), max(ay1, by1)
+    ix2, iy2 = min(ax2, bx2), min(ay2, by2)
+    iw, ih = max(0, ix2 - ix1), max(0, iy2 - iy1)
+    inter = iw * ih
+    union = a.w * a.h + b.w * b.h - inter
+    return float(inter / union) if union else 0.0
+
+
+def nms(cands: Iterable[Candidate], threshold: float) -> List[Candidate]:
+    items = sorted(cands, key=lambda c: c.score, reverse=True)
+    kept: List[Candidate] = []
+    for c in items:
+        if all(bbox_iou(c, k) < threshold for k in kept):
+            kept.append(c)
+    return kept
+
+
+def detect_sheet(map_path: str | None, tif_path: str | None, sheet_id: str, cfg: Dict, out_dir: str | Path) -> Path:
+    out_dir = ensure_dir(out_dir)
+    rh = open_georaster(map_path=map_path, tif_path=tif_path)
+    try:
+        tile_size = int(cfg["detector"]["tile_size"])
+        overlap = int(cfg["detector"]["tile_overlap"])
+        all_candidates: List[Candidate] = []
+        windows = list(iter_windows(rh.width, rh.height, tile_size, overlap))
+        LOG.info("Scanning %s as %d windows (%dx%d px)", sheet_id, len(windows), rh.width, rh.height)
+        for win in tqdm(windows, desc=f"scan {sheet_id}"):
+            rgb = read_window_rgb(rh.dataset, win)
+            cands = find_candidates_in_rgb(
+                rgb=rgb,
+                cfg=cfg,
+                sheet_id=sheet_id,
+                source_path=str(rh.path),
+                xoff=int(win.col_off),
+                yoff=int(win.row_off),
+            )
+            all_candidates.extend(cands)
+        kept = nms(all_candidates, float(cfg["detector"].get("nms_iou_threshold", 0.25)))
+        # Attach georeferenced centers when possible.
+        for c in kept:
+            if rh.crs is not None:
+                xgeo, ygeo = pixel_to_lonlat(rh.dataset, int(c.cy), int(c.cx))
+                c.lon = xgeo
+                c.lat = ygeo
+        out_csv = out_dir / f"{sheet_id}_candidates.csv"
+        pd.DataFrame([c.to_dict() for c in kept]).to_csv(out_csv, index=False)
+        LOG.info("%s: wrote %d candidates to %s", sheet_id, len(kept), out_csv)
+        return out_csv
+    finally:
+        rh.close()
+
+
+def draw_overlay(tif_path: str | Path, candidates_csv: str | Path, out_png: str | Path, max_side: int = 2400) -> Path:
+    """Draw candidates on a downscaled image. Uses TIFF path for simple visual QA."""
+    img = Image.open(tif_path).convert("RGB")
+    scale = min(1.0, max_side / max(img.size))
+    disp = img.resize((int(img.width * scale), int(img.height * scale))) if scale < 1 else img.copy()
+    draw = ImageDraw.Draw(disp)
+    df = pd.read_csv(candidates_csv)
+    color_by_style = {
+        "filled": (0, 255, 255),
+        "hollow": (0, 120, 255),
+        "border": (20, 20, 255),
+        "unknown": (255, 255, 0),
+    }
+    for _, r in df.iterrows():
+        color = color_by_style.get(str(r.get("fill_style", "unknown")), (255, 255, 0))
+        x1, y1 = float(r.x) * scale, float(r.y) * scale
+        x2, y2 = float(r.x + r.w) * scale, float(r.y + r.h) * scale
+        draw.rectangle([x1, y1, x2, y2], outline=color, width=max(1, int(2 * scale + 1)))
+    out_png = Path(out_png)
+    ensure_dir(out_png.parent)
+    disp.save(out_png)
+    LOG.info("Wrote overlay: %s", out_png)
+    return out_png
--- a/bgtopo_poc/export_yolo.py
+++ b/bgtopo_poc/export_yolo.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+import logging
+import random
+import shutil
+from pathlib import Path
+from typing import Dict
+
+import pandas as pd
+from PIL import Image
+
+from .utils import ensure_dir
+
+LOG = logging.getLogger(__name__)
+
+
+STYLE_TO_CLASS = {
+    "unknown": 0,
+    "filled": 1,
+    "hollow": 2,
+    "border": 3,
+}
+
+
+def _crop_image_and_labels(img: Image.Image, boxes: pd.DataFrame, x0: int, y0: int, size: int):
+    crop = img.crop((x0, y0, x0 + size, y0 + size)).convert("RGB")
+    labels = []
+    for _, r in boxes.iterrows():
+        bx1, by1, bx2, by2 = float(r.x), float(r.y), float(r.x + r.w), float(r.y + r.h)
+        ix1, iy1 = max(bx1, x0), max(by1, y0)
+        ix2, iy2 = min(bx2, x0 + size), min(by2, y0 + size)
+        if ix2 <= ix1 or iy2 <= iy1:
+            continue
+        visible_area = (ix2 - ix1) * (iy2 - iy1)
+        box_area = max((bx2 - bx1) * (by2 - by1), 1)
+        if visible_area / box_area < 0.35:
+            continue
+        cx = ((ix1 + ix2) / 2 - x0) / size
+        cy = ((iy1 + iy2) / 2 - y0) / size
+        w = (ix2 - ix1) / size
+        h = (iy2 - iy1) / size
+        cls = STYLE_TO_CLASS.get(str(r.get("fill_style", "unknown")), 0)
+        labels.append(f"{cls} {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}")
+    return crop, labels
+
+
+def export_candidates_to_yolo(
+    tif_path: str | Path,
+    candidates_csv: str | Path,
+    out_dir: str | Path,
+    cfg: Dict,
+    sheet_id: str,
+    tile_size: int = 1024,
+    overlap: int = 128,
+    val_fraction: float = 0.20,
+    include_empty_tiles: bool = True,
+    max_empty_tiles: int = 250,
+) -> Path:
+    out_dir = Path(out_dir)
+    for split in ["train", "val"]:
+        ensure_dir(out_dir / "images" / split)
+        ensure_dir(out_dir / "labels" / split)
+
+    img = Image.open(tif_path).convert("RGB")
+    boxes = pd.read_csv(candidates_csv)
+    step = max(1, tile_size - overlap)
+    random.seed(42)
+    empty_written = 0
+    total_written = 0
+
+    for y0 in range(0, max(1, img.height - tile_size + 1), step):
+        for x0 in range(0, max(1, img.width - tile_size + 1), step):
+            in_tile = boxes[
+                (boxes.cx >= x0) & (boxes.cx < x0 + tile_size) &
+                (boxes.cy >= y0) & (boxes.cy < y0 + tile_size)
+            ]
+            if in_tile.empty:
+                if not include_empty_tiles or empty_written >= max_empty_tiles:
+                    continue
+                # Keep some empty/hard-negative tiles to stop the model from detecting all blue map details.
+                if random.random() > 0.08:
+                    continue
+                empty_written += 1
+            crop, labels = _crop_image_and_labels(img, boxes, x0, y0, tile_size)
+            split = "val" if random.random() < val_fraction else "train"
+            stem = f"{sheet_id}_{x0}_{y0}"
+            crop.save(out_dir / "images" / split / f"{stem}.jpg", quality=92)
+            with open(out_dir / "labels" / split / f"{stem}.txt", "w", encoding="utf-8") as f:
+                f.write("\n".join(labels))
+            total_written += 1
+
+    data_yaml = out_dir / "data.yaml"
+    names = cfg.get("export", {}).get("yolo_class_names", ["blue_rect_unknown", "blue_rect_filled", "blue_rect_hollow", "blue_rect_border"])
+    with open(data_yaml, "w", encoding="utf-8") as f:
+        f.write(f"path: {out_dir.resolve()}\n")
+        f.write("train: images/train\n")
+        f.write("val: images/val\n")
+        f.write("names:\n")
+        for i, name in enumerate(names):
+            f.write(f"  {i}: {name}\n")
+    LOG.info("YOLO export complete: %s (%d tiles)", data_yaml, total_written)
+    return data_yaml
+
+
+def merge_yolo_datasets(src_dirs: list[str | Path], out_dir: str | Path) -> Path:
+    out_dir = Path(out_dir)
+    for split in ["train", "val"]:
+        ensure_dir(out_dir / "images" / split)
+        ensure_dir(out_dir / "labels" / split)
+    for src in src_dirs:
+        src = Path(src)
+        for split in ["train", "val"]:
+            for img in (src / "images" / split).glob("*.jpg"):
+                shutil.copy2(img, out_dir / "images" / split / img.name)
+            for lab in (src / "labels" / split).glob("*.txt"):
+                shutil.copy2(lab, out_dir / "labels" / split / lab.name)
+    return out_dir
--- a/bgtopo_poc/georef.py
+++ b/bgtopo_poc/georef.py
@@ -0,0 +1,116 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Tuple
+
+import numpy as np
+import rasterio
+from rasterio.errors import RasterioIOError
+from rasterio.transform import rowcol, xy
+from rasterio.windows import Window
+
+LOG = logging.getLogger(__name__)
+
+
+@dataclass
+class RasterHandle:
+    path: Path
+    dataset: rasterio.io.DatasetReader
+    used_map_file: bool
+
+    @property
+    def width(self) -> int:
+        return self.dataset.width
+
+    @property
+    def height(self) -> int:
+        return self.dataset.height
+
+    @property
+    def crs(self):
+        return self.dataset.crs
+
+    @property
+    def transform(self):
+        return self.dataset.transform
+
+    def close(self) -> None:
+        self.dataset.close()
+
+
+def open_georaster(map_path: Optional[str | Path] = None, tif_path: Optional[str | Path] = None) -> RasterHandle:
+    """Open .map if possible, otherwise .tif. .map gives georeferencing through GDAL's MAP driver."""
+    last_err = None
+    if map_path:
+        try:
+            p = Path(map_path)
+            ds = rasterio.open(p)
+            LOG.info("Opened georeferenced MAP dataset: %s", p)
+            return RasterHandle(p, ds, used_map_file=True)
+        except Exception as e:  # noqa: BLE001
+            last_err = e
+            LOG.warning("Could not open MAP dataset %s: %s", map_path, e)
+    if tif_path:
+        try:
+            p = Path(tif_path)
+            ds = rasterio.open(p)
+            LOG.info("Opened TIFF dataset: %s", p)
+            return RasterHandle(p, ds, used_map_file=False)
+        except RasterioIOError as e:
+            last_err = e
+    raise RuntimeError(f"Could not open raster. Last error: {last_err}")
+
+
+def read_window_rgb(ds: rasterio.io.DatasetReader, window: Window) -> np.ndarray:
+    """Read a raster window as uint8 RGB HxWx3."""
+    arr = ds.read(window=window, boundless=True, fill_value=255)
+    if arr.ndim != 3:
+        raise ValueError(f"Expected band-first array, got shape={arr.shape}")
+    if arr.shape[0] >= 3:
+        arr = arr[:3]
+    elif arr.shape[0] == 1:
+        arr = np.repeat(arr, 3, axis=0)
+    arr = np.moveaxis(arr, 0, -1)
+    if arr.dtype != np.uint8:
+        arr = np.clip(arr, 0, 255).astype(np.uint8)
+    return arr
+
+
+def iter_windows(width: int, height: int, tile_size: int, overlap: int):
+    step = max(1, tile_size - overlap)
+    y = 0
+    while y < height:
+        x = 0
+        h = min(tile_size, height - y)
+        while x < width:
+            w = min(tile_size, width - x)
+            yield Window(x, y, w, h)
+            if x + tile_size >= width:
+                break
+            x += step
+        if y + tile_size >= height:
+            break
+        y += step
+
+
+def lonlat_to_pixel(ds: rasterio.io.DatasetReader, lon: float, lat: float) -> Tuple[int, int]:
+    """Convert lon/lat to row/col. Assumes the raster CRS accepts lon/lat or GDAL handles geographic transform.
+
+    For a production version, reproject coordinates into ds.crs with pyproj first. The PoC does that in coordinates.py.
+    """
+    row, col = rowcol(ds.transform, lon, lat)
+    return int(row), int(col)
+
+
+def pixel_to_lonlat(ds: rasterio.io.DatasetReader, row: int, col: int) -> Tuple[float, float]:
+    x, y = xy(ds.transform, row, col, offset="center")
+    return float(x), float(y)
+
+
+def has_real_georef(ds: rasterio.io.DatasetReader) -> bool:
+    try:
+        return ds.crs is not None and not ds.transform.is_identity
+    except Exception:
+        return False
--- a/bgtopo_poc/inventory.py
+++ b/bgtopo_poc/inventory.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+import logging
+import re
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Iterable, List, Optional
+from urllib.parse import urljoin
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+from .utils import ensure_dir
+
+LOG = logging.getLogger(__name__)
+
+
+@dataclass
+class SheetAsset:
+    sheet_id: str
+    map_url: Optional[str]
+    tif_url: Optional[str]
+    map_path: Optional[str] = None
+    tif_path: Optional[str] = None
+
+    def to_dict(self):
+        return asdict(self)
+
+
+def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
+    """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing."""
+    LOG.info("Discovering assets from %s", base_url)
+    html = requests.get(base_url, timeout=60).text
+    soup = BeautifulSoup(html, "html.parser")
+    hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]
+
+    by_sheet: dict[str, SheetAsset] = {}
+    for href in hrefs:
+        if href.startswith("?") or href.startswith("/") or href == "../":
+            continue
+        if not include_100k and "100k" in href.lower():
+            continue
+        if not (href.lower().endswith(".map") or href.lower().endswith(".tif") or href.lower().endswith(".tiff")):
+            continue
+        sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
+        item = by_sheet.setdefault(sheet_id, SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None))
+        full_url = urljoin(base_url, href)
+        if href.lower().endswith(".map"):
+            item.map_url = full_url
+        else:
+            item.tif_url = full_url
+
+    assets = [v for v in by_sheet.values() if v.map_url and v.tif_url]
+    assets.sort(key=lambda x: x.sheet_id)
+    LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
+    return assets
+
+
+def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
+    rows = [a.to_dict() for a in assets]
+    out_csv = Path(out_csv)
+    ensure_dir(out_csv.parent)
+    pd.DataFrame(rows).to_csv(out_csv, index=False)
+    LOG.info("Wrote manifest: %s", out_csv)
+    return out_csv
+
+
+def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
+    df = pd.read_csv(path).fillna("")
+    assets: List[SheetAsset] = []
+    for _, r in df.iterrows():
+        assets.append(
+            SheetAsset(
+                sheet_id=str(r["sheet_id"]),
+                map_url=str(r.get("map_url") or "") or None,
+                tif_url=str(r.get("tif_url") or "") or None,
+                map_path=str(r.get("map_path") or "") or None,
+                tif_path=str(r.get("tif_path") or "") or None,
+            )
+        )
+    return assets
+
+
+def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
+    if out_path.exists() and out_path.stat().st_size > 0 and not overwrite:
+        return out_path
+    ensure_dir(out_path.parent)
+    with requests.get(url, stream=True, timeout=120) as r:
+        r.raise_for_status()
+        total = int(r.headers.get("content-length", "0") or 0)
+        with open(out_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=out_path.name) as pbar:
+            for chunk in r.iter_content(chunk_size=1024 * 512):
+                if chunk:
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+    return out_path
+
+
+def download_assets(
+    assets: List[SheetAsset],
+    out_dir: str | Path,
+    limit: Optional[int] = None,
+    overwrite: bool = False,
+) -> List[SheetAsset]:
+    out_dir = Path(out_dir)
+    selected = assets[:limit] if limit else assets
+    for item in selected:
+        sheet_dir = out_dir / item.sheet_id
+        if item.map_url:
+            item.map_path = str(_download_one(item.map_url, sheet_dir / f"{item.sheet_id}.map", overwrite=overwrite))
+        if item.tif_url:
+            item.tif_path = str(_download_one(item.tif_url, sheet_dir / f"{item.sheet_id}.tif", overwrite=overwrite))
+    return selected
--- a/bgtopo_poc/report.py
+++ b/bgtopo_poc/report.py
@@ -0,0 +1,80 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+from jinja2 import Template
+
+from .utils import ensure_dir
+
+REPORT_TEMPLATE = """
+<!doctype html>
+<html>
+<head>
+  <meta charset="utf-8" />
+  <title>BGtopoVJ Blue Box PoC Report</title>
+  <style>
+    body { font-family: Arial, sans-serif; max-width: 1200px; margin: 24px auto; line-height: 1.45; }
+    table { border-collapse: collapse; width: 100%; margin: 16px 0; }
+    th, td { border: 1px solid #ddd; padding: 8px; font-size: 14px; }
+    th { background: #f3f3f3; text-align: left; }
+    .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 18px; }
+    .card { border: 1px solid #ddd; border-radius: 12px; padding: 12px; }
+    img { max-width: 100%; border: 1px solid #ccc; }
+    code { background: #f6f6f6; padding: 2px 4px; }
+  </style>
+</head>
+<body>
+<h1>BGtopoVJ Blue Rectangle/Square PoC Report</h1>
+<p>This is a weak-label mining report. Treat candidates as review targets, not truth.</p>
+
+<h2>Candidate summary</h2>
+<table>
+<tr><th>Metric</th><th>Value</th></tr>
+<tr><td>Total candidates</td><td>{{ total }}</td></tr>
+<tr><td>Average score</td><td>{{ avg_score }}</td></tr>
+<tr><td>Median score</td><td>{{ med_score }}</td></tr>
+</table>
+
+<h2>By inferred fill style</h2>
+{{ style_table }}
+
+<h2>QA overlays</h2>
+<div class="grid">
+{% for overlay in overlays %}
+  <div class="card">
+    <p><code>{{ overlay.name }}</code></p>
+    <img src="{{ overlay.rel }}" />
+  </div>
+{% endfor %}
+</div>
+</body>
+</html>
+"""
+
+
+def build_report(candidate_csvs: list[str | Path], overlays: list[str | Path], out_html: str | Path) -> Path:
+    frames = [pd.read_csv(p) for p in candidate_csvs if Path(p).exists()]
+    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
+    out_html = Path(out_html)
+    ensure_dir(out_html.parent)
+    style_table = "<p>No candidates.</p>"
+    if not df.empty:
+        style_table = df.groupby("fill_style").agg(count=("fill_style", "size"), avg_score=("score", "mean")).reset_index().to_html(index=False)
+    overlay_items = []
+    for ov in overlays:
+        ov = Path(ov)
+        try:
+            rel = ov.relative_to(out_html.parent)
+        except ValueError:
+            rel = ov
+        overlay_items.append({"name": ov.name, "rel": str(rel).replace("\\", "/")})
+    html = Template(REPORT_TEMPLATE).render(
+        total=0 if df.empty else len(df),
+        avg_score="—" if df.empty else f"{df['score'].mean():.3f}",
+        med_score="—" if df.empty else f"{df['score'].median():.3f}",
+        style_table=style_table,
+        overlays=overlay_items,
+    )
+    out_html.write_text(html, encoding="utf-8")
+    return out_html
--- a/bgtopo_poc/train_yolo.py
+++ b/bgtopo_poc/train_yolo.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+LOG = logging.getLogger(__name__)
+
+
+def train_yolo(data_yaml: str | Path, model: str = "yolov8s.pt", imgsz: int = 1024, epochs: int = 80, batch: int = 4, device: str = "0"):
+    """Train YOLO on the generated weak-label dataset.
+
+    This function imports ultralytics lazily so the rest of the PoC works without GPU dependencies.
+    Review/correct the weak labels before treating this model as useful.
+    """
+    from ultralytics import YOLO
+
+    yolo = YOLO(model)
+    LOG.info("Starting YOLO training: model=%s data=%s imgsz=%d epochs=%d batch=%d device=%s", model, data_yaml, imgsz, epochs, batch, device)
+    return yolo.train(
+        data=str(data_yaml),
+        imgsz=imgsz,
+        epochs=epochs,
+        batch=batch,
+        device=device,
+        workers=4,
+        cache=False,
+        patience=20,
+        project="runs/bgtopo_bluebox",
+        name=f"{Path(data_yaml).parent.name}_{Path(model).stem}",
+        hsv_h=0.005,
+        hsv_s=0.20,
+        hsv_v=0.18,
+        degrees=0.0,
+        translate=0.05,
+        scale=0.20,
+        fliplr=0.5,
+        flipud=0.5,
+        mosaic=0.25,
+        close_mosaic=15,
+    )
--- a/bgtopo_poc/utils.py
+++ b/bgtopo_poc/utils.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any, Dict
+
+import yaml
+
+
+def setup_logging(verbose: bool = False) -> None:
+    logging.basicConfig(
+        level=logging.DEBUG if verbose else logging.INFO,
+        format="%(asctime)s | %(levelname)-8s | %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+
+def load_yaml(path: str | Path) -> Dict[str, Any]:
+    with open(path, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f) or {}
+
+
+def ensure_dir(path: str | Path) -> Path:
+    p = Path(path)
+    p.mkdir(parents=True, exist_ok=True)
+    return p
+
+
+def write_json(path: str | Path, data: Any) -> None:
+    p = Path(path)
+    ensure_dir(p.parent)
+    with open(p, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+
+
+def read_json(path: str | Path) -> Any:
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def safe_stem(name: str) -> str:
+    return Path(name).stem.replace("/", "_").replace("\\", "_")