diff --git a/bgtopo_poc/__init__.py b/bgtopo_poc/__init__.py new file mode 100644 index 0000000..3dc1f76 --- /dev/null +++ b/bgtopo_poc/__init__.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/bgtopo_poc/cli.py b/bgtopo_poc/cli.py new file mode 100644 index 0000000..d846f63 --- /dev/null +++ b/bgtopo_poc/cli.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +import argparse +import logging +from pathlib import Path + +from .coordinates import extract_coordinate_crops, score_coordinates_for_sheet +from .detector_cv import detect_sheet, draw_overlay +from .export_yolo import export_candidates_to_yolo +from .inventory import discover_original_assets, download_assets, read_manifest_csv, write_manifest_csv +from .report import build_report +from .train_yolo import train_yolo +from .utils import load_yaml, setup_logging + +LOG = logging.getLogger(__name__) + + +def cmd_inventory(args): + cfg = load_yaml(args.config) + base_url = args.base_url or cfg["source"]["base_url"] + assets = discover_original_assets(base_url=base_url, include_100k=args.include_100k) + if args.limit: + assets = assets[: args.limit] + write_manifest_csv(assets, args.out) + + +def cmd_download(args): + assets = read_manifest_csv(args.manifest) + selected = download_assets(assets, args.out_dir, limit=args.limit, overwrite=args.overwrite) + write_manifest_csv(selected, args.out_manifest) + + +def cmd_detect(args): + cfg = load_yaml(args.config) + detect_sheet(args.map, args.tif, args.sheet_id, cfg, args.out_dir) + + +def cmd_overlay(args): + draw_overlay(args.tif, args.candidates, args.out) + + +def cmd_score_coords(args): + cfg = load_yaml(args.config) + score_coordinates_for_sheet( + coord_csv=args.coordinates, + candidates_csv=args.candidates, + map_path=args.map, + tif_path=args.tif, + sheet_id=args.sheet_id, + cfg=cfg, + out_dir=args.out_dir, + coord_crs=args.coord_crs, + ) + + +def cmd_crops(args): + extract_coordinate_crops( + coord_scores_csv=args.scores, + map_path=args.map, + tif_path=args.tif, + out_dir=args.out_dir, + crop_size=args.crop_size, + ) + + +def cmd_export_yolo(args): + cfg = load_yaml(args.config) + export_candidates_to_yolo( + tif_path=args.tif, + candidates_csv=args.candidates, + out_dir=args.out_dir, + cfg=cfg, + sheet_id=args.sheet_id, + tile_size=args.tile_size, + overlap=args.overlap, + val_fraction=args.val_fraction, + ) + + +def cmd_train_yolo(args): + train_yolo(args.data_yaml, model=args.model, imgsz=args.imgsz, epochs=args.epochs, batch=args.batch, device=args.device) + + +def cmd_report(args): + build_report([Path(p) for p in args.candidates], [Path(p) for p in args.overlays], args.out) + + +def build_parser(): + p = argparse.ArgumentParser(prog="bgtopo-bluebox", description="BGtopoVJ blue rectangle/square PoC pipeline") + p.add_argument("--verbose", action="store_true") + sub = p.add_subparsers(dest="cmd", required=True) + + s = sub.add_parser("inventory", help="Crawl original raster directory and create manifest") + s.add_argument("--config", default="configs/blue_detector.yaml") + s.add_argument("--base-url", default=None) + s.add_argument("--out", default="data/manifest.csv") + s.add_argument("--limit", type=int, default=None) + s.add_argument("--include-100k", action="store_true") + s.set_defaults(func=cmd_inventory) + + s = sub.add_parser("download", help="Download .map/.tif pairs from manifest") + s.add_argument("--manifest", default="data/manifest.csv") + s.add_argument("--out-dir", default="data/raw") + s.add_argument("--out-manifest", default="data/manifest_downloaded.csv") + s.add_argument("--limit", type=int, default=2) + s.add_argument("--overwrite", action="store_true") + s.set_defaults(func=cmd_download) + + s = sub.add_parser("detect", help="Detect blue rectangle/square candidates on one sheet") + s.add_argument("--config", default="configs/blue_detector.yaml") + s.add_argument("--sheet-id", required=True) + s.add_argument("--map", default=None) + s.add_argument("--tif", required=True) + s.add_argument("--out-dir", default="data/interim/candidates") + s.set_defaults(func=cmd_detect) + + s = sub.add_parser("overlay", help="Draw candidate overlay for manual QA") + s.add_argument("--tif", required=True) + s.add_argument("--candidates", required=True) + s.add_argument("--out", required=True) + s.set_defaults(func=cmd_overlay) + + s = sub.add_parser("score-coords", help="Score coordinate CSV against candidates from one sheet") + s.add_argument("--config", default="configs/blue_detector.yaml") + s.add_argument("--sheet-id", required=True) + s.add_argument("--coordinates", required=True) + s.add_argument("--candidates", required=True) + s.add_argument("--map", default=None) + s.add_argument("--tif", required=True) + s.add_argument("--out-dir", default="data/interim/coordinate_scores") + s.add_argument("--coord-crs", default="EPSG:4326") + s.set_defaults(func=cmd_score_coords) + + s = sub.add_parser("crops", help="Extract review crops around scored coordinates") + s.add_argument("--scores", required=True) + s.add_argument("--map", default=None) + s.add_argument("--tif", required=True) + s.add_argument("--out-dir", default="data/interim/crops") + s.add_argument("--crop-size", type=int, default=256) + s.set_defaults(func=cmd_crops) + + s = sub.add_parser("export-yolo", help="Export weak candidates to YOLO dataset format") + s.add_argument("--config", default="configs/blue_detector.yaml") + s.add_argument("--sheet-id", required=True) + s.add_argument("--tif", required=True) + s.add_argument("--candidates", required=True) + s.add_argument("--out-dir", default="data/yolo/bluebox") + s.add_argument("--tile-size", type=int, default=1024) + s.add_argument("--overlap", type=int, default=128) + s.add_argument("--val-fraction", type=float, default=0.20) + s.set_defaults(func=cmd_export_yolo) + + s = sub.add_parser("train-yolo", help="Train YOLO on exported dataset") + s.add_argument("--data-yaml", required=True) + s.add_argument("--model", default="yolov8s.pt") + s.add_argument("--imgsz", type=int, default=1024) + s.add_argument("--epochs", type=int, default=80) + s.add_argument("--batch", type=int, default=4) + s.add_argument("--device", default="0") + s.set_defaults(func=cmd_train_yolo) + + s = sub.add_parser("report", help="Build HTML QA report") + s.add_argument("--candidates", nargs="+", required=True) + s.add_argument("--overlays", nargs="*", default=[]) + s.add_argument("--out", default="reports/poc_report.html") + s.set_defaults(func=cmd_report) + return p + + +def main(): + parser = build_parser() + args = parser.parse_args() + setup_logging(args.verbose) + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/bgtopo_poc/coordinates.py b/bgtopo_poc/coordinates.py new file mode 100644 index 0000000..8d4155b --- /dev/null +++ b/bgtopo_poc/coordinates.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd +from PIL import Image, ImageDraw +from pyproj import Transformer +from rasterio.windows import Window + +from .georef import open_georaster, read_window_rgb +from .utils import ensure_dir + +LOG = logging.getLogger(__name__) + + +def _normalize_coordinate_columns(df: pd.DataFrame) -> pd.DataFrame: + cols = {c.lower().strip(): c for c in df.columns} + lat_col = cols.get("lat") or cols.get("latitude") or cols.get("y") + lon_col = cols.get("lon") or cols.get("lng") or cols.get("longitude") or cols.get("x") + if not lat_col or not lon_col: + raise ValueError("Coordinate CSV needs lat/lon columns, or latitude/longitude, or y/x.") + out = df.copy() + out["lat"] = pd.to_numeric(out[lat_col], errors="coerce") + out["lon"] = pd.to_numeric(out[lon_col], errors="coerce") + if "id" not in out.columns: + out["id"] = [f"pt_{i:06d}" for i in range(len(out))] + return out.dropna(subset=["lat", "lon"]) + + +def load_coordinates(path: str | Path) -> pd.DataFrame: + return _normalize_coordinate_columns(pd.read_csv(path)) + + +def coord_to_rowcol(ds, lon: float, lat: float, coord_crs: str = "EPSG:4326") -> Optional[tuple[int, int]]: + if ds.crs is None: + return None + try: + transformer = Transformer.from_crs(coord_crs, ds.crs, always_xy=True) + x, y = transformer.transform(lon, lat) + row, col = ds.index(x, y) + return int(row), int(col) + except Exception as e: # noqa: BLE001 + LOG.debug("coord_to_rowcol failed: %s", e) + return None + + +def score_coordinates_for_sheet( + coord_csv: str | Path, + candidates_csv: str | Path, + map_path: str | None, + tif_path: str | None, + sheet_id: str, + cfg: Dict, + out_dir: str | Path, + coord_crs: str = "EPSG:4326", +) -> Path: + out_dir = ensure_dir(out_dir) + coords = load_coordinates(coord_csv) + cands = pd.read_csv(candidates_csv) + rh = open_georaster(map_path=map_path, tif_path=tif_path) + radius = float(cfg["coordinate_scoring"].get("search_radius_px", 45)) + try: + rows: List[dict] = [] + for _, pt in coords.iterrows(): + rc = coord_to_rowcol(rh.dataset, float(pt.lon), float(pt.lat), coord_crs=coord_crs) + if rc is None: + continue + row, col = rc + if row < 0 or col < 0 or row >= rh.height or col >= rh.width: + continue + if cands.empty: + nearest = None + else: + dx = cands["cx"].astype(float).to_numpy() - col + dy = cands["cy"].astype(float).to_numpy() - row + dist = np.sqrt(dx * dx + dy * dy) + i = int(np.argmin(dist)) + nearest = (i, float(dist[i])) + score = 0.0 + nearest_id = None + nearest_dist = None + nearest_style = None + nearest_det_score = None + decision = "auto_negative" + if nearest: + i, d = nearest + nearest_dist = d + if d <= radius: + nearest_id = i + nearest_style = str(cands.iloc[i].get("fill_style", "unknown")) + nearest_det_score = float(cands.iloc[i].get("score", 0.0)) + dist_factor = max(0.0, 1.0 - d / radius) + score = float(0.55 * nearest_det_score + 0.45 * dist_factor) + if score >= float(cfg["coordinate_scoring"].get("strong_score", 0.90)): + decision = "auto_positive" + elif score >= float(cfg["coordinate_scoring"].get("weak_score", 0.40)): + decision = "review" + rows.append({ + "id": pt.id, + "sheet_id": sheet_id, + "lat": float(pt.lat), + "lon": float(pt.lon), + "row": row, + "col": col, + "nearest_candidate_index": nearest_id, + "nearest_candidate_distance_px": nearest_dist, + "nearest_candidate_style": nearest_style, + "nearest_candidate_score": nearest_det_score, + "coordinate_score": score, + "decision": decision, + }) + out_csv = Path(out_dir) / f"{sheet_id}_coordinate_scores.csv" + pd.DataFrame(rows).to_csv(out_csv, index=False) + LOG.info("Wrote coordinate scores: %s", out_csv) + return out_csv + finally: + rh.close() + + +def extract_coordinate_crops( + coord_scores_csv: str | Path, + map_path: str | None, + tif_path: str | None, + out_dir: str | Path, + crop_size: int = 256, + only_decisions: tuple[str, ...] = ("review", "auto_positive"), +) -> Path: + out_dir = ensure_dir(out_dir) + df = pd.read_csv(coord_scores_csv) + rh = open_georaster(map_path=map_path, tif_path=tif_path) + half = crop_size // 2 + try: + for _, r in df.iterrows(): + if str(r.decision) not in only_decisions: + continue + row, col = int(r.row), int(r.col) + win = Window(col - half, row - half, crop_size, crop_size) + rgb = read_window_rgb(rh.dataset, win) + img = Image.fromarray(rgb).convert("RGB") + draw = ImageDraw.Draw(img) + draw.ellipse([half - 5, half - 5, half + 5, half + 5], outline=(255, 0, 0), width=2) + name = f"{str(r.id)}__{str(r.decision)}__score_{float(r.coordinate_score):.3f}.png" + img.save(Path(out_dir) / name) + LOG.info("Wrote crops into: %s", out_dir) + return Path(out_dir) + finally: + rh.close() diff --git a/bgtopo_poc/detector_cv.py b/bgtopo_poc/detector_cv.py new file mode 100644 index 0000000..321807e --- /dev/null +++ b/bgtopo_poc/detector_cv.py @@ -0,0 +1,223 @@ +from __future__ import annotations + +import logging +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + +import cv2 +import numpy as np +import pandas as pd +from PIL import Image, ImageDraw +from rasterio.windows import Window +from tqdm import tqdm + +from .georef import iter_windows, open_georaster, pixel_to_lonlat, read_window_rgb +from .utils import ensure_dir, safe_stem + +LOG = logging.getLogger(__name__) + + +@dataclass +class Candidate: + sheet_id: str + source_path: str + x: int + y: int + w: int + h: int + cx: float + cy: float + area: float + aspect: float + blue_fill_ratio: float + rectangularity: float + solidity: float + approx_vertices: int + fill_style: str + score: float + lon: float | None = None + lat: float | None = None + + def to_dict(self): + return asdict(self) + + +def build_blue_mask(rgb: np.ndarray, cfg: Dict) -> np.ndarray: + hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV) + mask = np.zeros(hsv.shape[:2], dtype=np.uint8) + for r in cfg["detector"].get("hsv_ranges", []): + lower = np.array(r["lower"], dtype=np.uint8) + upper = np.array(r["upper"], dtype=np.uint8) + mask |= cv2.inRange(hsv, lower, upper) + morph = cfg["detector"].get("morphology", {}) + open_k = int(morph.get("open_kernel", 0) or 0) + close_k = int(morph.get("close_kernel", 0) or 0) + if open_k > 1: + k = np.ones((open_k, open_k), np.uint8) + mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, k) + if close_k > 1: + k = np.ones((close_k, close_k), np.uint8) + mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, k) + dilate_iter = int(morph.get("dilate_iterations", 0) or 0) + if dilate_iter > 0: + mask = cv2.dilate(mask, np.ones((2, 2), np.uint8), iterations=dilate_iter) + return mask + + +def _classify_fill_style(blue_fill_ratio: float, rectangularity: float, solidity: float) -> str: + if blue_fill_ratio >= 0.48 and solidity >= 0.60: + return "filled" + if 0.10 <= blue_fill_ratio < 0.42 and rectangularity >= 0.28: + return "hollow" + if blue_fill_ratio < 0.18 and rectangularity >= 0.18: + return "border" + return "unknown" + + +def _score_candidate(blue_fill_ratio: float, rectangularity: float, solidity: float, aspect: float, approx_vertices: int) -> float: + aspect_bonus = 1.0 - min(abs(np.log(max(aspect, 1e-3))), 1.8) / 1.8 + vertex_bonus = 1.0 if 4 <= approx_vertices <= 8 else 0.55 + raw = 0.30 * blue_fill_ratio + 0.28 * rectangularity + 0.22 * solidity + 0.12 * aspect_bonus + 0.08 * vertex_bonus + return float(max(0.0, min(1.0, raw))) + + +def find_candidates_in_rgb(rgb: np.ndarray, cfg: Dict, sheet_id: str, source_path: str, xoff: int = 0, yoff: int = 0) -> List[Candidate]: + det = cfg["detector"] + mask = build_blue_mask(rgb, cfg) + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + out: List[Candidate] = [] + + for contour in contours: + area = float(cv2.contourArea(contour)) + if area < det["min_area_px"] or area > det["max_area_px"]: + continue + x, y, w, h = cv2.boundingRect(contour) + if w < det["min_width_px"] or h < det["min_height_px"]: + continue + if w > det["max_width_px"] or h > det["max_height_px"]: + continue + aspect = float(w / max(h, 1)) + if not (det["min_aspect"] <= aspect <= det["max_aspect"]): + continue + + bbox_mask = mask[y:y + h, x:x + w] + blue_fill_ratio = float(np.count_nonzero(bbox_mask) / max(w * h, 1)) + if not (det["min_blue_fill_ratio"] <= blue_fill_ratio <= det["max_blue_fill_ratio"]): + continue + + rectangularity = float(area / max(w * h, 1)) + if rectangularity < det["min_rectangularity"]: + continue + + hull = cv2.convexHull(contour) + hull_area = float(cv2.contourArea(hull)) + solidity = float(area / hull_area) if hull_area > 0 else 0.0 + if solidity < det["min_solidity"]: + continue + + peri = float(cv2.arcLength(contour, True)) + approx = cv2.approxPolyDP(contour, 0.04 * peri, True) + fill_style = _classify_fill_style(blue_fill_ratio, rectangularity, solidity) + score = _score_candidate(blue_fill_ratio, rectangularity, solidity, aspect, len(approx)) + out.append( + Candidate( + sheet_id=sheet_id, + source_path=source_path, + x=int(x + xoff), + y=int(y + yoff), + w=int(w), + h=int(h), + cx=float(x + xoff + w / 2), + cy=float(y + yoff + h / 2), + area=area, + aspect=aspect, + blue_fill_ratio=blue_fill_ratio, + rectangularity=rectangularity, + solidity=solidity, + approx_vertices=int(len(approx)), + fill_style=fill_style, + score=score, + ) + ) + return out + + +def bbox_iou(a: Candidate, b: Candidate) -> float: + ax1, ay1, ax2, ay2 = a.x, a.y, a.x + a.w, a.y + a.h + bx1, by1, bx2, by2 = b.x, b.y, b.x + b.w, b.y + b.h + ix1, iy1 = max(ax1, bx1), max(ay1, by1) + ix2, iy2 = min(ax2, bx2), min(ay2, by2) + iw, ih = max(0, ix2 - ix1), max(0, iy2 - iy1) + inter = iw * ih + union = a.w * a.h + b.w * b.h - inter + return float(inter / union) if union else 0.0 + + +def nms(cands: Iterable[Candidate], threshold: float) -> List[Candidate]: + items = sorted(cands, key=lambda c: c.score, reverse=True) + kept: List[Candidate] = [] + for c in items: + if all(bbox_iou(c, k) < threshold for k in kept): + kept.append(c) + return kept + + +def detect_sheet(map_path: str | None, tif_path: str | None, sheet_id: str, cfg: Dict, out_dir: str | Path) -> Path: + out_dir = ensure_dir(out_dir) + rh = open_georaster(map_path=map_path, tif_path=tif_path) + try: + tile_size = int(cfg["detector"]["tile_size"]) + overlap = int(cfg["detector"]["tile_overlap"]) + all_candidates: List[Candidate] = [] + windows = list(iter_windows(rh.width, rh.height, tile_size, overlap)) + LOG.info("Scanning %s as %d windows (%dx%d px)", sheet_id, len(windows), rh.width, rh.height) + for win in tqdm(windows, desc=f"scan {sheet_id}"): + rgb = read_window_rgb(rh.dataset, win) + cands = find_candidates_in_rgb( + rgb=rgb, + cfg=cfg, + sheet_id=sheet_id, + source_path=str(rh.path), + xoff=int(win.col_off), + yoff=int(win.row_off), + ) + all_candidates.extend(cands) + kept = nms(all_candidates, float(cfg["detector"].get("nms_iou_threshold", 0.25))) + # Attach georeferenced centers when possible. + for c in kept: + if rh.crs is not None: + xgeo, ygeo = pixel_to_lonlat(rh.dataset, int(c.cy), int(c.cx)) + c.lon = xgeo + c.lat = ygeo + out_csv = out_dir / f"{sheet_id}_candidates.csv" + pd.DataFrame([c.to_dict() for c in kept]).to_csv(out_csv, index=False) + LOG.info("%s: wrote %d candidates to %s", sheet_id, len(kept), out_csv) + return out_csv + finally: + rh.close() + + +def draw_overlay(tif_path: str | Path, candidates_csv: str | Path, out_png: str | Path, max_side: int = 2400) -> Path: + """Draw candidates on a downscaled image. Uses TIFF path for simple visual QA.""" + img = Image.open(tif_path).convert("RGB") + scale = min(1.0, max_side / max(img.size)) + disp = img.resize((int(img.width * scale), int(img.height * scale))) if scale < 1 else img.copy() + draw = ImageDraw.Draw(disp) + df = pd.read_csv(candidates_csv) + color_by_style = { + "filled": (0, 255, 255), + "hollow": (0, 120, 255), + "border": (20, 20, 255), + "unknown": (255, 255, 0), + } + for _, r in df.iterrows(): + color = color_by_style.get(str(r.get("fill_style", "unknown")), (255, 255, 0)) + x1, y1 = float(r.x) * scale, float(r.y) * scale + x2, y2 = float(r.x + r.w) * scale, float(r.y + r.h) * scale + draw.rectangle([x1, y1, x2, y2], outline=color, width=max(1, int(2 * scale + 1))) + out_png = Path(out_png) + ensure_dir(out_png.parent) + disp.save(out_png) + LOG.info("Wrote overlay: %s", out_png) + return out_png diff --git a/bgtopo_poc/export_yolo.py b/bgtopo_poc/export_yolo.py new file mode 100644 index 0000000..820d446 --- /dev/null +++ b/bgtopo_poc/export_yolo.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +import logging +import random +import shutil +from pathlib import Path +from typing import Dict + +import pandas as pd +from PIL import Image + +from .utils import ensure_dir + +LOG = logging.getLogger(__name__) + + +STYLE_TO_CLASS = { + "unknown": 0, + "filled": 1, + "hollow": 2, + "border": 3, +} + + +def _crop_image_and_labels(img: Image.Image, boxes: pd.DataFrame, x0: int, y0: int, size: int): + crop = img.crop((x0, y0, x0 + size, y0 + size)).convert("RGB") + labels = [] + for _, r in boxes.iterrows(): + bx1, by1, bx2, by2 = float(r.x), float(r.y), float(r.x + r.w), float(r.y + r.h) + ix1, iy1 = max(bx1, x0), max(by1, y0) + ix2, iy2 = min(bx2, x0 + size), min(by2, y0 + size) + if ix2 <= ix1 or iy2 <= iy1: + continue + visible_area = (ix2 - ix1) * (iy2 - iy1) + box_area = max((bx2 - bx1) * (by2 - by1), 1) + if visible_area / box_area < 0.35: + continue + cx = ((ix1 + ix2) / 2 - x0) / size + cy = ((iy1 + iy2) / 2 - y0) / size + w = (ix2 - ix1) / size + h = (iy2 - iy1) / size + cls = STYLE_TO_CLASS.get(str(r.get("fill_style", "unknown")), 0) + labels.append(f"{cls} {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}") + return crop, labels + + +def export_candidates_to_yolo( + tif_path: str | Path, + candidates_csv: str | Path, + out_dir: str | Path, + cfg: Dict, + sheet_id: str, + tile_size: int = 1024, + overlap: int = 128, + val_fraction: float = 0.20, + include_empty_tiles: bool = True, + max_empty_tiles: int = 250, +) -> Path: + out_dir = Path(out_dir) + for split in ["train", "val"]: + ensure_dir(out_dir / "images" / split) + ensure_dir(out_dir / "labels" / split) + + img = Image.open(tif_path).convert("RGB") + boxes = pd.read_csv(candidates_csv) + step = max(1, tile_size - overlap) + random.seed(42) + empty_written = 0 + total_written = 0 + + for y0 in range(0, max(1, img.height - tile_size + 1), step): + for x0 in range(0, max(1, img.width - tile_size + 1), step): + in_tile = boxes[ + (boxes.cx >= x0) & (boxes.cx < x0 + tile_size) & + (boxes.cy >= y0) & (boxes.cy < y0 + tile_size) + ] + if in_tile.empty: + if not include_empty_tiles or empty_written >= max_empty_tiles: + continue + # Keep some empty/hard-negative tiles to stop the model from detecting all blue map details. + if random.random() > 0.08: + continue + empty_written += 1 + crop, labels = _crop_image_and_labels(img, boxes, x0, y0, tile_size) + split = "val" if random.random() < val_fraction else "train" + stem = f"{sheet_id}_{x0}_{y0}" + crop.save(out_dir / "images" / split / f"{stem}.jpg", quality=92) + with open(out_dir / "labels" / split / f"{stem}.txt", "w", encoding="utf-8") as f: + f.write("\n".join(labels)) + total_written += 1 + + data_yaml = out_dir / "data.yaml" + names = cfg.get("export", {}).get("yolo_class_names", ["blue_rect_unknown", "blue_rect_filled", "blue_rect_hollow", "blue_rect_border"]) + with open(data_yaml, "w", encoding="utf-8") as f: + f.write(f"path: {out_dir.resolve()}\n") + f.write("train: images/train\n") + f.write("val: images/val\n") + f.write("names:\n") + for i, name in enumerate(names): + f.write(f" {i}: {name}\n") + LOG.info("YOLO export complete: %s (%d tiles)", data_yaml, total_written) + return data_yaml + + +def merge_yolo_datasets(src_dirs: list[str | Path], out_dir: str | Path) -> Path: + out_dir = Path(out_dir) + for split in ["train", "val"]: + ensure_dir(out_dir / "images" / split) + ensure_dir(out_dir / "labels" / split) + for src in src_dirs: + src = Path(src) + for split in ["train", "val"]: + for img in (src / "images" / split).glob("*.jpg"): + shutil.copy2(img, out_dir / "images" / split / img.name) + for lab in (src / "labels" / split).glob("*.txt"): + shutil.copy2(lab, out_dir / "labels" / split / lab.name) + return out_dir diff --git a/bgtopo_poc/georef.py b/bgtopo_poc/georef.py new file mode 100644 index 0000000..83e2eed --- /dev/null +++ b/bgtopo_poc/georef.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Tuple + +import numpy as np +import rasterio +from rasterio.errors import RasterioIOError +from rasterio.transform import rowcol, xy +from rasterio.windows import Window + +LOG = logging.getLogger(__name__) + + +@dataclass +class RasterHandle: + path: Path + dataset: rasterio.io.DatasetReader + used_map_file: bool + + @property + def width(self) -> int: + return self.dataset.width + + @property + def height(self) -> int: + return self.dataset.height + + @property + def crs(self): + return self.dataset.crs + + @property + def transform(self): + return self.dataset.transform + + def close(self) -> None: + self.dataset.close() + + +def open_georaster(map_path: Optional[str | Path] = None, tif_path: Optional[str | Path] = None) -> RasterHandle: + """Open .map if possible, otherwise .tif. .map gives georeferencing through GDAL's MAP driver.""" + last_err = None + if map_path: + try: + p = Path(map_path) + ds = rasterio.open(p) + LOG.info("Opened georeferenced MAP dataset: %s", p) + return RasterHandle(p, ds, used_map_file=True) + except Exception as e: # noqa: BLE001 + last_err = e + LOG.warning("Could not open MAP dataset %s: %s", map_path, e) + if tif_path: + try: + p = Path(tif_path) + ds = rasterio.open(p) + LOG.info("Opened TIFF dataset: %s", p) + return RasterHandle(p, ds, used_map_file=False) + except RasterioIOError as e: + last_err = e + raise RuntimeError(f"Could not open raster. Last error: {last_err}") + + +def read_window_rgb(ds: rasterio.io.DatasetReader, window: Window) -> np.ndarray: + """Read a raster window as uint8 RGB HxWx3.""" + arr = ds.read(window=window, boundless=True, fill_value=255) + if arr.ndim != 3: + raise ValueError(f"Expected band-first array, got shape={arr.shape}") + if arr.shape[0] >= 3: + arr = arr[:3] + elif arr.shape[0] == 1: + arr = np.repeat(arr, 3, axis=0) + arr = np.moveaxis(arr, 0, -1) + if arr.dtype != np.uint8: + arr = np.clip(arr, 0, 255).astype(np.uint8) + return arr + + +def iter_windows(width: int, height: int, tile_size: int, overlap: int): + step = max(1, tile_size - overlap) + y = 0 + while y < height: + x = 0 + h = min(tile_size, height - y) + while x < width: + w = min(tile_size, width - x) + yield Window(x, y, w, h) + if x + tile_size >= width: + break + x += step + if y + tile_size >= height: + break + y += step + + +def lonlat_to_pixel(ds: rasterio.io.DatasetReader, lon: float, lat: float) -> Tuple[int, int]: + """Convert lon/lat to row/col. Assumes the raster CRS accepts lon/lat or GDAL handles geographic transform. + + For a production version, reproject coordinates into ds.crs with pyproj first. The PoC does that in coordinates.py. + """ + row, col = rowcol(ds.transform, lon, lat) + return int(row), int(col) + + +def pixel_to_lonlat(ds: rasterio.io.DatasetReader, row: int, col: int) -> Tuple[float, float]: + x, y = xy(ds.transform, row, col, offset="center") + return float(x), float(y) + + +def has_real_georef(ds: rasterio.io.DatasetReader) -> bool: + try: + return ds.crs is not None and not ds.transform.is_identity + except Exception: + return False diff --git a/bgtopo_poc/inventory.py b/bgtopo_poc/inventory.py new file mode 100644 index 0000000..115b520 --- /dev/null +++ b/bgtopo_poc/inventory.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Iterable, List, Optional +from urllib.parse import urljoin + +import pandas as pd +import requests +from bs4 import BeautifulSoup +from tqdm import tqdm + +from .utils import ensure_dir + +LOG = logging.getLogger(__name__) + + +@dataclass +class SheetAsset: + sheet_id: str + map_url: Optional[str] + tif_url: Optional[str] + map_path: Optional[str] = None + tif_path: Optional[str] = None + + def to_dict(self): + return asdict(self) + + +def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]: + """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing.""" + LOG.info("Discovering assets from %s", base_url) + html = requests.get(base_url, timeout=60).text + soup = BeautifulSoup(html, "html.parser") + hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")] + + by_sheet: dict[str, SheetAsset] = {} + for href in hrefs: + if href.startswith("?") or href.startswith("/") or href == "../": + continue + if not include_100k and "100k" in href.lower(): + continue + if not (href.lower().endswith(".map") or href.lower().endswith(".tif") or href.lower().endswith(".tiff")): + continue + sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE) + item = by_sheet.setdefault(sheet_id, SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None)) + full_url = urljoin(base_url, href) + if href.lower().endswith(".map"): + item.map_url = full_url + else: + item.tif_url = full_url + + assets = [v for v in by_sheet.values() if v.map_url and v.tif_url] + assets.sort(key=lambda x: x.sheet_id) + LOG.info("Discovered %d complete .map/.tif pairs", len(assets)) + return assets + + +def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path: + rows = [a.to_dict() for a in assets] + out_csv = Path(out_csv) + ensure_dir(out_csv.parent) + pd.DataFrame(rows).to_csv(out_csv, index=False) + LOG.info("Wrote manifest: %s", out_csv) + return out_csv + + +def read_manifest_csv(path: str | Path) -> List[SheetAsset]: + df = pd.read_csv(path).fillna("") + assets: List[SheetAsset] = [] + for _, r in df.iterrows(): + assets.append( + SheetAsset( + sheet_id=str(r["sheet_id"]), + map_url=str(r.get("map_url") or "") or None, + tif_url=str(r.get("tif_url") or "") or None, + map_path=str(r.get("map_path") or "") or None, + tif_path=str(r.get("tif_path") or "") or None, + ) + ) + return assets + + +def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path: + if out_path.exists() and out_path.stat().st_size > 0 and not overwrite: + return out_path + ensure_dir(out_path.parent) + with requests.get(url, stream=True, timeout=120) as r: + r.raise_for_status() + total = int(r.headers.get("content-length", "0") or 0) + with open(out_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=out_path.name) as pbar: + for chunk in r.iter_content(chunk_size=1024 * 512): + if chunk: + f.write(chunk) + pbar.update(len(chunk)) + return out_path + + +def download_assets( + assets: List[SheetAsset], + out_dir: str | Path, + limit: Optional[int] = None, + overwrite: bool = False, +) -> List[SheetAsset]: + out_dir = Path(out_dir) + selected = assets[:limit] if limit else assets + for item in selected: + sheet_dir = out_dir / item.sheet_id + if item.map_url: + item.map_path = str(_download_one(item.map_url, sheet_dir / f"{item.sheet_id}.map", overwrite=overwrite)) + if item.tif_url: + item.tif_path = str(_download_one(item.tif_url, sheet_dir / f"{item.sheet_id}.tif", overwrite=overwrite)) + return selected diff --git a/bgtopo_poc/report.py b/bgtopo_poc/report.py new file mode 100644 index 0000000..5d49b8b --- /dev/null +++ b/bgtopo_poc/report.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd +from jinja2 import Template + +from .utils import ensure_dir + +REPORT_TEMPLATE = """ + + +
+ +This is a weak-label mining report. Treat candidates as review targets, not truth.
+ +| Metric | Value |
|---|---|
| Total candidates | {{ total }} |
| Average score | {{ avg_score }} |
| Median score | {{ med_score }} |
{{ overlay.name }}
No candidates.
" + if not df.empty: + style_table = df.groupby("fill_style").agg(count=("fill_style", "size"), avg_score=("score", "mean")).reset_index().to_html(index=False) + overlay_items = [] + for ov in overlays: + ov = Path(ov) + try: + rel = ov.relative_to(out_html.parent) + except ValueError: + rel = ov + overlay_items.append({"name": ov.name, "rel": str(rel).replace("\\", "/")}) + html = Template(REPORT_TEMPLATE).render( + total=0 if df.empty else len(df), + avg_score="—" if df.empty else f"{df['score'].mean():.3f}", + med_score="—" if df.empty else f"{df['score'].median():.3f}", + style_table=style_table, + overlays=overlay_items, + ) + out_html.write_text(html, encoding="utf-8") + return out_html diff --git a/bgtopo_poc/train_yolo.py b/bgtopo_poc/train_yolo.py new file mode 100644 index 0000000..98f7f95 --- /dev/null +++ b/bgtopo_poc/train_yolo.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import logging +from pathlib import Path + +LOG = logging.getLogger(__name__) + + +def train_yolo(data_yaml: str | Path, model: str = "yolov8s.pt", imgsz: int = 1024, epochs: int = 80, batch: int = 4, device: str = "0"): + """Train YOLO on the generated weak-label dataset. + + This function imports ultralytics lazily so the rest of the PoC works without GPU dependencies. + Review/correct the weak labels before treating this model as useful. + """ + from ultralytics import YOLO + + yolo = YOLO(model) + LOG.info("Starting YOLO training: model=%s data=%s imgsz=%d epochs=%d batch=%d device=%s", model, data_yaml, imgsz, epochs, batch, device) + return yolo.train( + data=str(data_yaml), + imgsz=imgsz, + epochs=epochs, + batch=batch, + device=device, + workers=4, + cache=False, + patience=20, + project="runs/bgtopo_bluebox", + name=f"{Path(data_yaml).parent.name}_{Path(model).stem}", + hsv_h=0.005, + hsv_s=0.20, + hsv_v=0.18, + degrees=0.0, + translate=0.05, + scale=0.20, + fliplr=0.5, + flipud=0.5, + mosaic=0.25, + close_mosaic=15, + ) diff --git a/bgtopo_poc/utils.py b/bgtopo_poc/utils.py new file mode 100644 index 0000000..797d798 --- /dev/null +++ b/bgtopo_poc/utils.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import Any, Dict + +import yaml + + +def setup_logging(verbose: bool = False) -> None: + logging.basicConfig( + level=logging.DEBUG if verbose else logging.INFO, + format="%(asctime)s | %(levelname)-8s | %(message)s", + datefmt="%H:%M:%S", + ) + + +def load_yaml(path: str | Path) -> Dict[str, Any]: + with open(path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) or {} + + +def ensure_dir(path: str | Path) -> Path: + p = Path(path) + p.mkdir(parents=True, exist_ok=True) + return p + + +def write_json(path: str | Path, data: Any) -> None: + p = Path(path) + ensure_dir(p.parent) + with open(p, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + +def read_json(path: str | Path) -> Any: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def safe_stem(name: str) -> str: + return Path(name).stem.replace("/", "_").replace("\\", "_") diff --git a/configs/blue_detector.yaml b/configs/blue_detector.yaml new file mode 100644 index 0000000..478e552 --- /dev/null +++ b/configs/blue_detector.yaml @@ -0,0 +1,47 @@ +# Tuned to be permissive. Expect false positives from rivers, lakes, names and contours. +# The purpose is weak-label mining + hard-negative discovery, not final truth. +source: + base_url: "https://web.uni-plovdiv.net/vedrin/raster/original/" + default_crs: "EPSG:4326" + +detector: + tile_size: 1400 + tile_overlap: 96 + min_width_px: 5 + min_height_px: 5 + max_width_px: 95 + max_height_px: 95 + min_area_px: 18 + max_area_px: 6500 + min_aspect: 0.35 + max_aspect: 4.25 + min_blue_fill_ratio: 0.045 + max_blue_fill_ratio: 0.92 + min_rectangularity: 0.18 + min_solidity: 0.22 + nms_iou_threshold: 0.25 + # OpenCV HSV hue range is 0-179, not 0-360. + hsv_ranges: + - name: blue_dark_to_light + lower: [82, 28, 45] + upper: [137, 255, 255] + - name: pale_cyan_low_sat + lower: [78, 10, 105] + upper: [115, 115, 255] + morphology: + open_kernel: 2 + close_kernel: 3 + dilate_iterations: 1 + +coordinate_scoring: + search_radius_px: 45 + strong_score: 0.90 + weak_score: 0.40 + crop_size_px: 256 + +export: + yolo_class_names: + - blue_rect_unknown + - blue_rect_filled + - blue_rect_hollow + - blue_rect_border diff --git a/data/coordinates/example_points.csv b/data/coordinates/example_points.csv new file mode 100644 index 0000000..6f01c56 --- /dev/null +++ b/data/coordinates/example_points.csv @@ -0,0 +1,3 @@ +id,lat,lon,expected +example_001,42.58837223,23.19638729,unknown +example_002,42.70000000,23.32000000,unknown diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml new file mode 100644 index 0000000..d383298 --- /dev/null +++ b/docker-compose.gpu.yml @@ -0,0 +1,18 @@ +services: + bgtopo-bluebox: + build: + context: . + dockerfile: docker/Dockerfile.gpu + image: bgtopo-bluebox-poc:gpu + working_dir: /app + volumes: + - .:/app + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + stdin_open: true + tty: true diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu new file mode 100644 index 0000000..29b7c00 --- /dev/null +++ b/docker/Dockerfile.gpu @@ -0,0 +1,19 @@ +FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv git wget curl ca-certificates \ + gdal-bin libgdal-dev python3-gdal libgl1 libglib2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY requirements.txt /app/requirements.txt +RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install --no-cache-dir -r requirements.txt + +COPY . /app +ENV PYTHONPATH=/app +CMD ["bash"] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f25f8c4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "bgtopo-bluebox-poc" +version = "0.1.0" +description = "PoC pipeline for detecting blue square/rectangle symbols in BGtopoVJ rasters and scoring coordinate points." +requires-python = ">=3.10" +dependencies = [] + +[tool.setuptools] +packages = ["bgtopo_poc"] + +[project.scripts] +bgtopo-bluebox = "bgtopo_poc.cli:main" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..dd9a356 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,22 @@ +# Core GIS / raster stack +rasterio>=1.3.9 +pyproj>=3.6.1 +shapely>=2.0.2 +geopandas>=0.14.4 +fiona>=1.9.6 + +# CV / ML +opencv-python>=4.9.0 +numpy>=1.26.4 +pandas>=2.2.1 +Pillow>=10.2.0 +scikit-learn>=1.4.1 +ultralytics>=8.2.0 +PyYAML>=6.0.1 +tqdm>=4.66.2 +requests>=2.31.0 +beautifulsoup4>=4.12.3 + +# Reports / QA +matplotlib>=3.8.3 +jinja2>=3.1.3 diff --git a/scripts/run_pilot.sh b/scripts/run_pilot.sh new file mode 100644 index 0000000..7ba9f80 --- /dev/null +++ b/scripts/run_pilot.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Run from repository root. +# First run downloads two sheets only. Increase --limit after confirming overlays look sane. + +python -m bgtopo_poc.cli inventory \ + --config configs/blue_detector.yaml \ + --out data/manifest.csv \ + --limit 2 + +python -m bgtopo_poc.cli download \ + --manifest data/manifest.csv \ + --out-dir data/raw \ + --out-manifest data/manifest_downloaded.csv \ + --limit 2 + +python - <<'PY' +import pandas as pd +from pathlib import Path +m = pd.read_csv('data/manifest_downloaded.csv') +for _, r in m.iterrows(): + sid = r['sheet_id'] + Path('data/interim/candidates').mkdir(parents=True, exist_ok=True) + print(f"Scanning {sid}") +PY + +while IFS=, read -r sheet_id map_url tif_url map_path tif_path; do + if [[ "$sheet_id" == "sheet_id" ]]; then continue; fi + python -m bgtopo_poc.cli detect \ + --config configs/blue_detector.yaml \ + --sheet-id "$sheet_id" \ + --map "$map_path" \ + --tif "$tif_path" \ + --out-dir data/interim/candidates + + python -m bgtopo_poc.cli overlay \ + --tif "$tif_path" \ + --candidates "data/interim/candidates/${sheet_id}_candidates.csv" \ + --out "reports/overlays/${sheet_id}_overlay.png" + +done < data/manifest_downloaded.csv + +python -m bgtopo_poc.cli report \ + --candidates data/interim/candidates/*_candidates.csv \ + --overlays reports/overlays/*_overlay.png \ + --out reports/poc_report.html + +echo "Open reports/poc_report.html" diff --git a/scripts/run_training_example.sh b/scripts/run_training_example.sh new file mode 100644 index 0000000..ec88a4f --- /dev/null +++ b/scripts/run_training_example.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Pick one downloaded sheet/candidate file first. +SHEET_ID="${1:-K-34-009-2}" +TIF="data/raw/${SHEET_ID}/${SHEET_ID}.tif" +CAND="data/interim/candidates/${SHEET_ID}_candidates.csv" +OUT="data/yolo/${SHEET_ID}" + +python -m bgtopo_poc.cli export-yolo \ + --config configs/blue_detector.yaml \ + --sheet-id "$SHEET_ID" \ + --tif "$TIF" \ + --candidates "$CAND" \ + --out-dir "$OUT" \ + --tile-size 1024 \ + --overlap 128 + +# RTX 3080 FE 10 GB: start batch 2-4 at imgsz=1024. Raise only if VRAM allows it. +python -m bgtopo_poc.cli train-yolo \ + --data-yaml "$OUT/data.yaml" \ + --model yolov8s.pt \ + --imgsz 1024 \ + --epochs 80 \ + --batch 4 \ + --device 0