This commit is contained in:
nq
2026-05-03 21:58:47 +03:00
parent fd44235ff4
commit e45d1cb6b4
18 changed files with 1259 additions and 0 deletions

1
bgtopo_poc/__init__.py Normal file
View File

@@ -0,0 +1 @@
__version__ = "0.1.0"

178
bgtopo_poc/cli.py Normal file
View File

@@ -0,0 +1,178 @@
from __future__ import annotations
import argparse
import logging
from pathlib import Path
from .coordinates import extract_coordinate_crops, score_coordinates_for_sheet
from .detector_cv import detect_sheet, draw_overlay
from .export_yolo import export_candidates_to_yolo
from .inventory import discover_original_assets, download_assets, read_manifest_csv, write_manifest_csv
from .report import build_report
from .train_yolo import train_yolo
from .utils import load_yaml, setup_logging
LOG = logging.getLogger(__name__)
def cmd_inventory(args):
    """Discover raster assets and write them to the manifest CSV.

    The base URL is taken from ``--base-url`` when given, otherwise from the
    ``source.base_url`` entry of the YAML config. A truthy ``--limit`` keeps
    only the first N discovered assets.
    """
    settings = load_yaml(args.config)
    url = args.base_url if args.base_url else settings["source"]["base_url"]
    found = discover_original_assets(base_url=url, include_100k=args.include_100k)
    if args.limit:
        found = found[: args.limit]
    write_manifest_csv(found, args.out)
def cmd_download(args):
    """Download the assets listed in a manifest and write the updated manifest."""
    manifest_assets = read_manifest_csv(args.manifest)
    downloaded = download_assets(
        manifest_assets,
        args.out_dir,
        limit=args.limit,
        overwrite=args.overwrite,
    )
    write_manifest_csv(downloaded, args.out_manifest)
def cmd_detect(args):
    """Run candidate detection on one sheet with the config given by ``--config``."""
    settings = load_yaml(args.config)
    detect_sheet(
        map_path=args.map,
        tif_path=args.tif,
        sheet_id=args.sheet_id,
        cfg=settings,
        out_dir=args.out_dir,
    )
def cmd_overlay(args):
    """Render the candidate overlay image used for manual QA."""
    draw_overlay(tif_path=args.tif, candidates_csv=args.candidates, out_png=args.out)
def cmd_score_coords(args):
    """Score a coordinate CSV against the detected candidates of one sheet."""
    settings = load_yaml(args.config)
    call_kwargs = {
        "coord_csv": args.coordinates,
        "candidates_csv": args.candidates,
        "map_path": args.map,
        "tif_path": args.tif,
        "sheet_id": args.sheet_id,
        "cfg": settings,
        "out_dir": args.out_dir,
        "coord_crs": args.coord_crs,
    }
    score_coordinates_for_sheet(**call_kwargs)
def cmd_crops(args):
    """Write review crops around the scored coordinates."""
    call_kwargs = {
        "coord_scores_csv": args.scores,
        "map_path": args.map,
        "tif_path": args.tif,
        "out_dir": args.out_dir,
        "crop_size": args.crop_size,
    }
    extract_coordinate_crops(**call_kwargs)
def cmd_export_yolo(args):
    """Export the weak candidates of one sheet as a tiled YOLO dataset."""
    settings = load_yaml(args.config)
    call_kwargs = {
        "tif_path": args.tif,
        "candidates_csv": args.candidates,
        "out_dir": args.out_dir,
        "cfg": settings,
        "sheet_id": args.sheet_id,
        "tile_size": args.tile_size,
        "overlap": args.overlap,
        "val_fraction": args.val_fraction,
    }
    export_candidates_to_yolo(**call_kwargs)
def cmd_train_yolo(args):
    """Forward the parsed training options to ``train_yolo``."""
    train_yolo(
        args.data_yaml,
        model=args.model,
        imgsz=args.imgsz,
        epochs=args.epochs,
        batch=args.batch,
        device=args.device,
    )
def cmd_report(args):
    """Build the HTML QA report from candidate CSVs and overlay images."""
    candidate_paths = list(map(Path, args.candidates))
    overlay_paths = list(map(Path, args.overlays))
    build_report(candidate_paths, overlay_paths, args.out)
def build_parser():
    """Build the ``bgtopo-bluebox`` argument parser.

    Every sub-command stores its handler in ``func`` through ``set_defaults``;
    ``main`` dispatches on that attribute.
    """
    default_config = "configs/blue_detector.yaml"

    parser = argparse.ArgumentParser(
        prog="bgtopo-bluebox",
        description="BGtopoVJ blue rectangle/square PoC pipeline",
    )
    parser.add_argument("--verbose", action="store_true")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # inventory: crawl the listing and write a manifest
    inventory = commands.add_parser("inventory", help="Crawl original raster directory and create manifest")
    inventory.add_argument("--config", default=default_config)
    inventory.add_argument("--base-url", default=None)
    inventory.add_argument("--out", default="data/manifest.csv")
    inventory.add_argument("--limit", type=int, default=None)
    inventory.add_argument("--include-100k", action="store_true")
    inventory.set_defaults(func=cmd_inventory)

    # download: fetch the files named in a manifest
    download = commands.add_parser("download", help="Download .map/.tif pairs from manifest")
    download.add_argument("--manifest", default="data/manifest.csv")
    download.add_argument("--out-dir", default="data/raw")
    download.add_argument("--out-manifest", default="data/manifest_downloaded.csv")
    download.add_argument("--limit", type=int, default=2)
    download.add_argument("--overwrite", action="store_true")
    download.set_defaults(func=cmd_download)

    # detect: classical CV candidate detection on one sheet
    detect = commands.add_parser("detect", help="Detect blue rectangle/square candidates on one sheet")
    detect.add_argument("--config", default=default_config)
    detect.add_argument("--sheet-id", required=True)
    detect.add_argument("--map", default=None)
    detect.add_argument("--tif", required=True)
    detect.add_argument("--out-dir", default="data/interim/candidates")
    detect.set_defaults(func=cmd_detect)

    # overlay: QA image with candidate boxes
    overlay = commands.add_parser("overlay", help="Draw candidate overlay for manual QA")
    overlay.add_argument("--tif", required=True)
    overlay.add_argument("--candidates", required=True)
    overlay.add_argument("--out", required=True)
    overlay.set_defaults(func=cmd_overlay)

    # score-coords: match external coordinates to candidates
    score = commands.add_parser("score-coords", help="Score coordinate CSV against candidates from one sheet")
    score.add_argument("--config", default=default_config)
    score.add_argument("--sheet-id", required=True)
    score.add_argument("--coordinates", required=True)
    score.add_argument("--candidates", required=True)
    score.add_argument("--map", default=None)
    score.add_argument("--tif", required=True)
    score.add_argument("--out-dir", default="data/interim/coordinate_scores")
    score.add_argument("--coord-crs", default="EPSG:4326")
    score.set_defaults(func=cmd_score_coords)

    # crops: review crops around scored coordinates
    crops = commands.add_parser("crops", help="Extract review crops around scored coordinates")
    crops.add_argument("--scores", required=True)
    crops.add_argument("--map", default=None)
    crops.add_argument("--tif", required=True)
    crops.add_argument("--out-dir", default="data/interim/crops")
    crops.add_argument("--crop-size", type=int, default=256)
    crops.set_defaults(func=cmd_crops)

    # export-yolo: tile the sheet and write YOLO labels
    export = commands.add_parser("export-yolo", help="Export weak candidates to YOLO dataset format")
    export.add_argument("--config", default=default_config)
    export.add_argument("--sheet-id", required=True)
    export.add_argument("--tif", required=True)
    export.add_argument("--candidates", required=True)
    export.add_argument("--out-dir", default="data/yolo/bluebox")
    export.add_argument("--tile-size", type=int, default=1024)
    export.add_argument("--overlap", type=int, default=128)
    export.add_argument("--val-fraction", type=float, default=0.20)
    export.set_defaults(func=cmd_export_yolo)

    # train-yolo: train on the exported dataset
    train = commands.add_parser("train-yolo", help="Train YOLO on exported dataset")
    train.add_argument("--data-yaml", required=True)
    train.add_argument("--model", default="yolov8s.pt")
    train.add_argument("--imgsz", type=int, default=1024)
    train.add_argument("--epochs", type=int, default=80)
    train.add_argument("--batch", type=int, default=4)
    train.add_argument("--device", default="0")
    train.set_defaults(func=cmd_train_yolo)

    # report: HTML QA summary
    report = commands.add_parser("report", help="Build HTML QA report")
    report.add_argument("--candidates", nargs="+", required=True)
    report.add_argument("--overlays", nargs="*", default=[])
    report.add_argument("--out", default="reports/poc_report.html")
    report.set_defaults(func=cmd_report)

    return parser
def main(argv=None):
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour;
            passing a list allows the CLI to be driven programmatically.
    """
    args = build_parser().parse_args(argv)
    setup_logging(args.verbose)
    # Each sub-parser registered its handler under ``func``.
    args.func(args)


if __name__ == "__main__":
    main()

150
bgtopo_poc/coordinates.py Normal file
View File

@@ -0,0 +1,150 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
from pyproj import Transformer
from rasterio.windows import Window
from .georef import open_georaster, read_window_rgb
from .utils import ensure_dir
LOG = logging.getLogger(__name__)
def _normalize_coordinate_columns(df: pd.DataFrame) -> pd.DataFrame:
cols = {c.lower().strip(): c for c in df.columns}
lat_col = cols.get("lat") or cols.get("latitude") or cols.get("y")
lon_col = cols.get("lon") or cols.get("lng") or cols.get("longitude") or cols.get("x")
if not lat_col or not lon_col:
raise ValueError("Coordinate CSV needs lat/lon columns, or latitude/longitude, or y/x.")
out = df.copy()
out["lat"] = pd.to_numeric(out[lat_col], errors="coerce")
out["lon"] = pd.to_numeric(out[lon_col], errors="coerce")
if "id" not in out.columns:
out["id"] = [f"pt_{i:06d}" for i in range(len(out))]
return out.dropna(subset=["lat", "lon"])
def load_coordinates(path: str | Path) -> pd.DataFrame:
    """Read a coordinate CSV and normalize it to ``lat``/``lon``/``id`` columns."""
    raw = pd.read_csv(path)
    return _normalize_coordinate_columns(raw)
def coord_to_rowcol(ds, lon: float, lat: float, coord_crs: str = "EPSG:4326") -> Optional[tuple[int, int]]:
    """Convert a coordinate in ``coord_crs`` to a (row, col) pixel index of ``ds``.

    Returns ``None`` when the dataset has no CRS or when the transformation or
    indexing raises; the failure is only logged at DEBUG level.
    """
    if ds.crs is None:
        return None
    try:
        # always_xy=True: the transformer takes (lon, lat) / (x, y) order.
        to_raster = Transformer.from_crs(coord_crs, ds.crs, always_xy=True)
        x_geo, y_geo = to_raster.transform(lon, lat)
        pixel_row, pixel_col = ds.index(x_geo, y_geo)
        return int(pixel_row), int(pixel_col)
    except Exception as e:  # noqa: BLE001
        LOG.debug("coord_to_rowcol failed: %s", e)
        return None
def score_coordinates_for_sheet(
    coord_csv: str | Path,
    candidates_csv: str | Path,
    map_path: str | None,
    tif_path: str | None,
    sheet_id: str,
    cfg: Dict,
    out_dir: str | Path,
    coord_crs: str = "EPSG:4326",
) -> Path:
    """Score each input coordinate against the nearest detected candidate.

    Each coordinate is projected to a pixel position and points outside the
    raster are skipped. The nearest candidate centre (``cx``/``cy``) is looked
    up; when it lies within ``search_radius_px`` the score is
    ``0.55 * candidate score + 0.45 * distance factor``. The decision is
    ``auto_positive``, ``review`` or ``auto_negative`` depending on the
    ``strong_score`` / ``weak_score`` thresholds in ``cfg["coordinate_scoring"]``.

    Args:
        coord_csv: CSV with coordinate columns (see ``load_coordinates``).
        candidates_csv: Candidate CSV with at least ``cx`` and ``cy`` columns.
        map_path: Optional ``.map`` path, tried first when opening the raster.
        tif_path: Optional TIFF path used when the ``.map`` cannot be opened.
        sheet_id: Identifier written to every row and used in the file name.
        cfg: Config mapping; must contain a ``coordinate_scoring`` section.
        out_dir: Output directory (created if missing).
        coord_crs: CRS of the input coordinates.

    Returns:
        Path of the written ``<sheet_id>_coordinate_scores.csv``. The header
        row is always written, even when no coordinate falls on the sheet.

    Raises:
        KeyError: if ``cfg`` has no ``coordinate_scoring`` section.
    """
    out_dir = ensure_dir(out_dir)
    coords = load_coordinates(coord_csv)
    cands = pd.read_csv(candidates_csv)

    # Read the config before opening the raster so a missing key cannot leak
    # an open dataset handle.
    scoring = cfg["coordinate_scoring"]
    radius = float(scoring.get("search_radius_px", 45))
    strong_score = float(scoring.get("strong_score", 0.90))
    weak_score = float(scoring.get("weak_score", 0.40))

    # Candidate centres do not change between points; convert them once.
    have_candidates = not cands.empty
    if have_candidates:
        cand_cx = cands["cx"].astype(float).to_numpy()
        cand_cy = cands["cy"].astype(float).to_numpy()

    columns = [
        "id",
        "sheet_id",
        "lat",
        "lon",
        "row",
        "col",
        "nearest_candidate_index",
        "nearest_candidate_distance_px",
        "nearest_candidate_style",
        "nearest_candidate_score",
        "coordinate_score",
        "decision",
    ]
    rows: List[dict] = []

    rh = open_georaster(map_path=map_path, tif_path=tif_path)
    try:
        for _, pt in coords.iterrows():
            rc = coord_to_rowcol(rh.dataset, float(pt["lon"]), float(pt["lat"]), coord_crs=coord_crs)
            if rc is None:
                continue
            row, col = rc
            if row < 0 or col < 0 or row >= rh.height or col >= rh.width:
                continue

            score = 0.0
            nearest_id = None
            nearest_dist = None
            nearest_style = None
            nearest_det_score = None
            decision = "auto_negative"

            if have_candidates:
                dx = cand_cx - col
                dy = cand_cy - row
                dist = np.sqrt(dx * dx + dy * dy)
                i = int(np.argmin(dist))
                nearest_dist = float(dist[i])
                if nearest_dist <= radius:
                    cand = cands.iloc[i]
                    nearest_id = i
                    nearest_style = str(cand.get("fill_style", "unknown"))
                    nearest_det_score = float(cand.get("score", 0.0))
                    # A zero radius only admits exact hits; avoid dividing by zero.
                    dist_factor = max(0.0, 1.0 - nearest_dist / radius) if radius > 0 else 1.0
                    score = float(0.55 * nearest_det_score + 0.45 * dist_factor)
                    if score >= strong_score:
                        decision = "auto_positive"
                    elif score >= weak_score:
                        decision = "review"

            rows.append({
                "id": pt["id"],
                "sheet_id": sheet_id,
                "lat": float(pt["lat"]),
                "lon": float(pt["lon"]),
                "row": row,
                "col": col,
                "nearest_candidate_index": nearest_id,
                "nearest_candidate_distance_px": nearest_dist,
                "nearest_candidate_style": nearest_style,
                "nearest_candidate_score": nearest_det_score,
                "coordinate_score": score,
                "decision": decision,
            })
    finally:
        rh.close()

    out_csv = Path(out_dir) / f"{sheet_id}_coordinate_scores.csv"
    # Explicit columns keep the schema in the file even when ``rows`` is empty.
    pd.DataFrame(rows, columns=columns).to_csv(out_csv, index=False)
    LOG.info("Wrote coordinate scores: %s", out_csv)
    return out_csv
def extract_coordinate_crops(
    coord_scores_csv: str | Path,
    map_path: str | None,
    tif_path: str | None,
    out_dir: str | Path,
    crop_size: int = 256,
    only_decisions: tuple[str, ...] = ("review", "auto_positive"),
) -> Path:
    """Write a square PNG crop around every scored coordinate selected for review.

    Only rows whose ``decision`` is in ``only_decisions`` are exported. Each
    crop is ``crop_size`` pixels wide and tall, centred on the row's
    ``row``/``col`` pixel, with a red circle drawn at the crop centre. File
    names combine the row id, decision and coordinate score.

    Returns:
        The output directory as a ``Path``.
    """
    out_dir = ensure_dir(out_dir)
    scores = pd.read_csv(coord_scores_csv)
    rh = open_georaster(map_path=map_path, tif_path=tif_path)
    half = crop_size // 2
    # Bounding box of the centre marker, in crop pixel coordinates.
    marker_box = [half - 5, half - 5, half + 5, half + 5]
    try:
        for _, record in scores.iterrows():
            decision = str(record.decision)
            if decision not in only_decisions:
                continue
            centre_row = int(record.row)
            centre_col = int(record.col)
            window = Window(centre_col - half, centre_row - half, crop_size, crop_size)
            pixels = read_window_rgb(rh.dataset, window)
            crop = Image.fromarray(pixels).convert("RGB")
            ImageDraw.Draw(crop).ellipse(marker_box, outline=(255, 0, 0), width=2)
            file_name = f"{str(record.id)}__{decision}__score_{float(record.coordinate_score):.3f}.png"
            crop.save(Path(out_dir) / file_name)
        LOG.info("Wrote crops into: %s", out_dir)
        return Path(out_dir)
    finally:
        rh.close()

223
bgtopo_poc/detector_cv.py Normal file
View File

@@ -0,0 +1,223 @@
from __future__ import annotations
import logging
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
import cv2
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
from rasterio.windows import Window
from tqdm import tqdm
from .georef import iter_windows, open_georaster, pixel_to_lonlat, read_window_rgb
from .utils import ensure_dir, safe_stem
LOG = logging.getLogger(__name__)
@dataclass
class Candidate:
    """One detected blue-box candidate, in full-sheet pixel coordinates."""

    # Provenance
    sheet_id: str
    source_path: str

    # Bounding box (top-left corner plus size) and its centre
    x: int
    y: int
    w: int
    h: int
    cx: float
    cy: float

    # Shape measurements computed by the detector
    area: float
    aspect: float
    blue_fill_ratio: float
    rectangularity: float
    solidity: float
    approx_vertices: int

    # Derived classification and heuristic score
    fill_style: str
    score: float

    # Georeferenced centre; stays None unless the detector attaches it
    lon: float | None = None
    lat: float | None = None

    def to_dict(self):
        """Return the candidate as a plain dict keyed by field name."""
        return asdict(self)
def build_blue_mask(rgb: np.ndarray, cfg: Dict) -> np.ndarray:
    """Build a binary uint8 mask of pixels inside the configured HSV ranges.

    The mask is the union of ``cv2.inRange`` over every entry in
    ``cfg["detector"]["hsv_ranges"]``. It is then optionally opened, closed
    and dilated according to ``cfg["detector"]["morphology"]``; a kernel size
    of 0 or 1 and a dilate count of 0 disable the respective step.
    """
    hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV)
    mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
    for hsv_range in cfg["detector"].get("hsv_ranges", []):
        lo = np.array(hsv_range["lower"], dtype=np.uint8)
        hi = np.array(hsv_range["upper"], dtype=np.uint8)
        mask |= cv2.inRange(hsv, lo, hi)

    morph = cfg["detector"].get("morphology", {})
    open_size = int(morph.get("open_kernel", 0) or 0)
    close_size = int(morph.get("close_kernel", 0) or 0)
    # Opening runs before closing, each with a square all-ones kernel.
    for kernel_size, operation in ((open_size, cv2.MORPH_OPEN), (close_size, cv2.MORPH_CLOSE)):
        if kernel_size > 1:
            kernel = np.ones((kernel_size, kernel_size), np.uint8)
            mask = cv2.morphologyEx(mask, operation, kernel)

    dilate_rounds = int(morph.get("dilate_iterations", 0) or 0)
    if dilate_rounds > 0:
        mask = cv2.dilate(mask, np.ones((2, 2), np.uint8), iterations=dilate_rounds)
    return mask
def _classify_fill_style(blue_fill_ratio: float, rectangularity: float, solidity: float) -> str:
if blue_fill_ratio >= 0.48 and solidity >= 0.60:
return "filled"
if 0.10 <= blue_fill_ratio < 0.42 and rectangularity >= 0.28:
return "hollow"
if blue_fill_ratio < 0.18 and rectangularity >= 0.18:
return "border"
return "unknown"
def _score_candidate(blue_fill_ratio: float, rectangularity: float, solidity: float, aspect: float, approx_vertices: int) -> float:
aspect_bonus = 1.0 - min(abs(np.log(max(aspect, 1e-3))), 1.8) / 1.8
vertex_bonus = 1.0 if 4 <= approx_vertices <= 8 else 0.55
raw = 0.30 * blue_fill_ratio + 0.28 * rectangularity + 0.22 * solidity + 0.12 * aspect_bonus + 0.08 * vertex_bonus
return float(max(0.0, min(1.0, raw)))
def find_candidates_in_rgb(rgb: np.ndarray, cfg: Dict, sheet_id: str, source_path: str, xoff: int = 0, yoff: int = 0) -> List[Candidate]:
    """Find rectangle-like blue blobs in one RGB tile.

    External contours of the blue mask are filtered by the ``cfg["detector"]``
    limits in this order: area, size, aspect ratio, blue fill ratio,
    rectangularity and solidity. Surviving contours are classified and scored.
    ``xoff``/``yoff`` are added to the tile-local box so the returned
    candidates carry coordinates offset by the tile origin.
    """
    limits = cfg["detector"]
    blue_mask = build_blue_mask(rgb, cfg)
    contours, _hierarchy = cv2.findContours(blue_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    found: List[Candidate] = []
    for cnt in contours:
        # --- area gate ---
        area = float(cv2.contourArea(cnt))
        if not (limits["min_area_px"] <= area <= limits["max_area_px"]):
            continue

        # --- bounding-box size gates ---
        bx, by, bw, bh = cv2.boundingRect(cnt)
        too_small = bw < limits["min_width_px"] or bh < limits["min_height_px"]
        if too_small:
            continue
        too_large = bw > limits["max_width_px"] or bh > limits["max_height_px"]
        if too_large:
            continue

        # --- aspect ratio gate ---
        aspect = float(bw / max(bh, 1))
        if aspect < limits["min_aspect"] or aspect > limits["max_aspect"]:
            continue

        # --- share of mask pixels inside the bounding box ---
        box_pixels = max(bw * bh, 1)
        patch = blue_mask[by:by + bh, bx:bx + bw]
        blue_fill_ratio = float(np.count_nonzero(patch) / box_pixels)
        if blue_fill_ratio < limits["min_blue_fill_ratio"] or blue_fill_ratio > limits["max_blue_fill_ratio"]:
            continue

        # --- contour area relative to its bounding box ---
        rectangularity = float(area / box_pixels)
        if rectangularity < limits["min_rectangularity"]:
            continue

        # --- contour area relative to its convex hull ---
        hull_area = float(cv2.contourArea(cv2.convexHull(cnt)))
        solidity = float(area / hull_area) if hull_area > 0 else 0.0
        if solidity < limits["min_solidity"]:
            continue

        # Polygon approximation with a tolerance of 4% of the perimeter.
        perimeter = float(cv2.arcLength(cnt, True))
        polygon = cv2.approxPolyDP(cnt, 0.04 * perimeter, True)
        vertex_count = len(polygon)

        style = _classify_fill_style(blue_fill_ratio, rectangularity, solidity)
        quality = _score_candidate(blue_fill_ratio, rectangularity, solidity, aspect, vertex_count)

        candidate = Candidate(
            sheet_id=sheet_id,
            source_path=source_path,
            x=int(bx + xoff),
            y=int(by + yoff),
            w=int(bw),
            h=int(bh),
            cx=float(bx + xoff + bw / 2),
            cy=float(by + yoff + bh / 2),
            area=area,
            aspect=aspect,
            blue_fill_ratio=blue_fill_ratio,
            rectangularity=rectangularity,
            solidity=solidity,
            approx_vertices=int(vertex_count),
            fill_style=style,
            score=quality,
        )
        found.append(candidate)
    return found
def bbox_iou(a: Candidate, b: Candidate) -> float:
    """Return the intersection-over-union of two candidates' bounding boxes.

    Returns 0.0 when the union area is zero.
    """
    # Corners of the overlap rectangle (may be inverted when boxes are disjoint).
    left = max(a.x, b.x)
    top = max(a.y, b.y)
    right = min(a.x + a.w, b.x + b.w)
    bottom = min(a.y + a.h, b.y + b.h)

    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    inter = overlap_w * overlap_h
    union = a.w * a.h + b.w * b.h - inter
    if not union:
        return 0.0
    return float(inter / union)
def nms(cands: Iterable[Candidate], threshold: float) -> List[Candidate]:
    """Greedy non-maximum suppression by bounding-box IoU.

    Candidates are visited from highest to lowest score; one is kept only if
    its IoU with every already-kept candidate is below ``threshold``.
    """
    survivors: List[Candidate] = []
    for cand in sorted(cands, key=lambda c: c.score, reverse=True):
        for winner in survivors:
            if not bbox_iou(cand, winner) < threshold:
                break  # overlaps a higher-scoring box too much
        else:
            survivors.append(cand)
    return survivors
def detect_sheet(map_path: str | None, tif_path: str | None, sheet_id: str, cfg: Dict, out_dir: str | Path) -> Path:
    """Scan one sheet tile by tile and write its deduplicated candidates to CSV.

    The raster is read in overlapping windows (``tile_size`` / ``tile_overlap``
    from ``cfg["detector"]``). Candidates from all windows are merged with
    non-maximum suppression (``nms_iou_threshold``, default 0.25), and when the
    dataset has a CRS each candidate gets the coordinate of its centre.

    Args:
        map_path: Optional ``.map`` path, tried first when opening the raster.
        tif_path: Optional TIFF path used when the ``.map`` cannot be opened.
        sheet_id: Identifier stored on each candidate and used in the file name.
        cfg: Config mapping with a ``detector`` section.
        out_dir: Output directory (created if missing).

    Returns:
        Path of ``<sheet_id>_candidates.csv``. The file always carries the full
        ``Candidate`` column header, even when nothing was detected, so that
        downstream readers see a stable schema.
    """
    from dataclasses import fields  # local import: only needed for the CSV schema

    out_dir = ensure_dir(out_dir)
    rh = open_georaster(map_path=map_path, tif_path=tif_path)
    try:
        tile_size = int(cfg["detector"]["tile_size"])
        overlap = int(cfg["detector"]["tile_overlap"])
        all_candidates: List[Candidate] = []
        windows = list(iter_windows(rh.width, rh.height, tile_size, overlap))
        LOG.info("Scanning %s as %d windows (%dx%d px)", sheet_id, len(windows), rh.width, rh.height)
        for win in tqdm(windows, desc=f"scan {sheet_id}"):
            rgb = read_window_rgb(rh.dataset, win)
            all_candidates.extend(
                find_candidates_in_rgb(
                    rgb=rgb,
                    cfg=cfg,
                    sheet_id=sheet_id,
                    source_path=str(rh.path),
                    xoff=int(win.col_off),
                    yoff=int(win.row_off),
                )
            )

        # Overlapping windows report the same box more than once.
        kept = nms(all_candidates, float(cfg["detector"].get("nms_iou_threshold", 0.25)))

        # Attach georeferenced centers when possible.
        # NOTE(review): pixel_to_lonlat applies the dataset transform without
        # reprojection, so lon/lat are in the dataset CRS units; confirm that
        # CRS is geographic before treating them as degrees.
        if rh.crs is not None:
            for c in kept:
                c.lon, c.lat = pixel_to_lonlat(rh.dataset, int(c.cy), int(c.cx))

        out_csv = out_dir / f"{sheet_id}_candidates.csv"
        # Explicit columns keep the header in the file when ``kept`` is empty.
        columns = [f.name for f in fields(Candidate)]
        pd.DataFrame([c.to_dict() for c in kept], columns=columns).to_csv(out_csv, index=False)
        LOG.info("%s: wrote %d candidates to %s", sheet_id, len(kept), out_csv)
        return out_csv
    finally:
        rh.close()
def draw_overlay(tif_path: str | Path, candidates_csv: str | Path, out_png: str | Path, max_side: int = 2400) -> Path:
    """Draw candidate boxes on a downscaled copy of the sheet for visual QA.

    The image is shrunk so that its longer side is at most ``max_side`` pixels
    (never enlarged). Boxes are coloured by ``fill_style``; unknown or missing
    styles are drawn in yellow.

    Args:
        tif_path: Image file to draw on.
        candidates_csv: Candidate CSV with ``x``, ``y``, ``w``, ``h`` columns.
        out_png: Output image path; parent directories are created.
        max_side: Maximum length of the longer side of the output image.

    Returns:
        ``out_png`` as a ``Path``.
    """
    # Close the source file as soon as the pixels are converted; the original
    # left the file handle open.
    with Image.open(tif_path) as src:
        img = src.convert("RGB")

    scale = min(1.0, max_side / max(img.size))
    if scale < 1:
        # Clamp to 1 px so extreme aspect ratios cannot produce a zero-sized image.
        new_size = (max(1, int(img.width * scale)), max(1, int(img.height * scale)))
        disp = img.resize(new_size)
    else:
        disp = img
    draw = ImageDraw.Draw(disp)

    df = pd.read_csv(candidates_csv)
    color_by_style = {
        "filled": (0, 255, 255),
        "hollow": (0, 120, 255),
        "border": (20, 20, 255),
        "unknown": (255, 255, 0),
    }
    line_width = max(1, int(2 * scale + 1))
    for _, r in df.iterrows():
        color = color_by_style.get(str(r.get("fill_style", "unknown")), (255, 255, 0))
        x1, y1 = float(r.x) * scale, float(r.y) * scale
        x2, y2 = float(r.x + r.w) * scale, float(r.y + r.h) * scale
        draw.rectangle([x1, y1, x2, y2], outline=color, width=line_width)

    out_png = Path(out_png)
    ensure_dir(out_png.parent)
    disp.save(out_png)
    LOG.info("Wrote overlay: %s", out_png)
    return out_png

117
bgtopo_poc/export_yolo.py Normal file
View File

@@ -0,0 +1,117 @@
from __future__ import annotations
import logging
import random
import shutil
from pathlib import Path
from typing import Dict
import pandas as pd
from PIL import Image
from .utils import ensure_dir
LOG = logging.getLogger(__name__)
# Maps a candidate's ``fill_style`` string to the integer class id written
# into the YOLO label files. Styles not listed here are exported as class 0.
STYLE_TO_CLASS = {
    "unknown": 0,
    "filled": 1,
    "hollow": 2,
    "border": 3,
}
def _crop_image_and_labels(img: Image.Image, boxes: pd.DataFrame, x0: int, y0: int, size: int):
crop = img.crop((x0, y0, x0 + size, y0 + size)).convert("RGB")
labels = []
for _, r in boxes.iterrows():
bx1, by1, bx2, by2 = float(r.x), float(r.y), float(r.x + r.w), float(r.y + r.h)
ix1, iy1 = max(bx1, x0), max(by1, y0)
ix2, iy2 = min(bx2, x0 + size), min(by2, y0 + size)
if ix2 <= ix1 or iy2 <= iy1:
continue
visible_area = (ix2 - ix1) * (iy2 - iy1)
box_area = max((bx2 - bx1) * (by2 - by1), 1)
if visible_area / box_area < 0.35:
continue
cx = ((ix1 + ix2) / 2 - x0) / size
cy = ((iy1 + iy2) / 2 - y0) / size
w = (ix2 - ix1) / size
h = (iy2 - iy1) / size
cls = STYLE_TO_CLASS.get(str(r.get("fill_style", "unknown")), 0)
labels.append(f"{cls} {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}")
return crop, labels
def _tile_origins(extent: int, tile_size: int, step: int) -> list[int]:
    """Return tile start offsets along one axis, including an edge-aligned last tile."""
    last = max(0, extent - tile_size)
    origins = list(range(0, last + 1, step))
    if origins[-1] != last:
        # The regular grid stops short of the edge; add one tile flush with it.
        origins.append(last)
    return origins


def export_candidates_to_yolo(
    tif_path: str | Path,
    candidates_csv: str | Path,
    out_dir: str | Path,
    cfg: Dict,
    sheet_id: str,
    tile_size: int = 1024,
    overlap: int = 128,
    val_fraction: float = 0.20,
    include_empty_tiles: bool = True,
    max_empty_tiles: int = 250,
) -> Path:
    """Tile a sheet and export its candidates as a YOLO detection dataset.

    Tiles containing at least one candidate centre are always written. Empty
    tiles are sampled with probability 0.08, up to ``max_empty_tiles``, to
    provide hard negatives. Each written tile is assigned to ``val`` with
    probability ``val_fraction``, otherwise to ``train``. A ``data.yaml``
    describing the dataset is written next to the ``images``/``labels`` folders.

    Changes relative to the first version:
      * the right and bottom edges of the sheet are covered by an extra tile
        aligned with the image edge, so candidates there are no longer dropped;
      * the image file is closed after loading;
      * a private ``random.Random(42)`` is used instead of reseeding the
        process-wide generator;
      * a candidates CSV with no rows (with or without a header) is accepted.

    Args:
        tif_path: Sheet image.
        candidates_csv: Candidate CSV with ``x``, ``y``, ``w``, ``h``, ``cx``,
            ``cy`` and optionally ``fill_style`` columns.
        out_dir: Dataset root directory.
        cfg: Config mapping; ``export.yolo_class_names`` overrides class names.
        sheet_id: Prefix of the tile file names.
        tile_size: Tile edge length in pixels.
        overlap: Overlap between neighbouring tiles in pixels.
        val_fraction: Probability of assigning a tile to the validation split.
        include_empty_tiles: Whether tiles without candidates may be written.
        max_empty_tiles: Upper bound on written empty tiles.

    Returns:
        Path of the written ``data.yaml``.
    """
    out_dir = Path(out_dir)
    for split in ("train", "val"):
        ensure_dir(out_dir / "images" / split)
        ensure_dir(out_dir / "labels" / split)

    boxes = pd.read_csv(candidates_csv)
    if boxes.empty:
        # Normalize to a frame that has the columns used below, so a sheet
        # without detections still yields negative tiles instead of crashing.
        boxes = pd.DataFrame(columns=["x", "y", "w", "h", "cx", "cy", "fill_style"])

    # convert() returns an independent in-memory image, so the file can be closed.
    with Image.open(tif_path) as src:
        img = src.convert("RGB")

    step = max(1, tile_size - overlap)
    rng = random.Random(42)  # deterministic without touching global RNG state
    empty_written = 0
    total_written = 0
    for y0 in _tile_origins(img.height, tile_size, step):
        for x0 in _tile_origins(img.width, tile_size, step):
            in_tile = boxes[
                (boxes["cx"] >= x0) & (boxes["cx"] < x0 + tile_size) &
                (boxes["cy"] >= y0) & (boxes["cy"] < y0 + tile_size)
            ]
            if in_tile.empty:
                if not include_empty_tiles or empty_written >= max_empty_tiles:
                    continue
                # Keep some empty/hard-negative tiles to stop the model from detecting all blue map details.
                if rng.random() > 0.08:
                    continue
                empty_written += 1
            # All boxes are passed (not just in_tile) so boxes centred in a
            # neighbouring tile but partly visible here are labelled too.
            crop, labels = _crop_image_and_labels(img, boxes, x0, y0, tile_size)
            split = "val" if rng.random() < val_fraction else "train"
            stem = f"{sheet_id}_{x0}_{y0}"
            crop.save(out_dir / "images" / split / f"{stem}.jpg", quality=92)
            with open(out_dir / "labels" / split / f"{stem}.txt", "w", encoding="utf-8") as f:
                f.write("\n".join(labels))
            total_written += 1

    data_yaml = out_dir / "data.yaml"
    names = cfg.get("export", {}).get("yolo_class_names", ["blue_rect_unknown", "blue_rect_filled", "blue_rect_hollow", "blue_rect_border"])
    with open(data_yaml, "w", encoding="utf-8") as f:
        f.write(f"path: {out_dir.resolve()}\n")
        f.write("train: images/train\n")
        f.write("val: images/val\n")
        f.write("names:\n")
        for i, name in enumerate(names):
            f.write(f" {i}: {name}\n")
    LOG.info("YOLO export complete: %s (%d tiles)", data_yaml, total_written)
    return data_yaml
def merge_yolo_datasets(src_dirs: list[str | Path], out_dir: str | Path) -> Path:
    """Copy the train/val images and labels of several datasets into one tree.

    Only ``*.jpg`` images and ``*.txt`` labels are copied. Files keep their
    names, so a later source overwrites an earlier file of the same name.
    """
    out_dir = Path(out_dir)
    splits = ("train", "val")
    kinds = (("images", "*.jpg"), ("labels", "*.txt"))

    for split in splits:
        for kind, _pattern in kinds:
            ensure_dir(out_dir / kind / split)

    for source_root in map(Path, src_dirs):
        for split in splits:
            for kind, pattern in kinds:
                target_dir = out_dir / kind / split
                for item in (source_root / kind / split).glob(pattern):
                    shutil.copy2(item, target_dir / item.name)
    return out_dir

116
bgtopo_poc/georef.py Normal file
View File

@@ -0,0 +1,116 @@
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple
import numpy as np
import rasterio
from rasterio.errors import RasterioIOError
from rasterio.transform import rowcol, xy
from rasterio.windows import Window
LOG = logging.getLogger(__name__)
@dataclass
class RasterHandle:
    """An opened raster dataset together with the path it was opened from.

    The handle can be used as a context manager; leaving the ``with`` block
    closes the dataset. Calling ``close()`` directly keeps working as before.
    """

    path: Path  # file that was actually opened
    dataset: rasterio.io.DatasetReader  # the open dataset
    used_map_file: bool  # True when opened through the .map file

    @property
    def width(self) -> int:
        """Raster width as reported by the dataset."""
        return self.dataset.width

    @property
    def height(self) -> int:
        """Raster height as reported by the dataset."""
        return self.dataset.height

    @property
    def crs(self):
        """Dataset CRS as reported by the dataset."""
        return self.dataset.crs

    @property
    def transform(self):
        """Dataset transform as reported by the dataset."""
        return self.dataset.transform

    def close(self) -> None:
        """Close the underlying dataset."""
        self.dataset.close()

    def __enter__(self) -> "RasterHandle":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        # Always release the dataset; exceptions propagate unchanged.
        self.close()
def open_georaster(map_path: Optional[str | Path] = None, tif_path: Optional[str | Path] = None) -> RasterHandle:
    """Open .map if possible, otherwise .tif. .map gives georeferencing through GDAL's MAP driver.

    Args:
        map_path: Optional path of the ``.map`` file; tried first.
        tif_path: Optional path of the TIFF; tried when ``map_path`` is not
            given or cannot be opened.

    Returns:
        A ``RasterHandle`` for the first dataset that opens.

    Raises:
        RuntimeError: if neither path is given or no dataset can be opened.
            The last underlying error is attached as ``__cause__``.
    """
    if not map_path and not tif_path:
        # Previously this surfaced as the confusing "Last error: None".
        raise RuntimeError("Could not open raster: no map_path or tif_path was provided.")

    last_err = None
    if map_path:
        try:
            p = Path(map_path)
            ds = rasterio.open(p)
            LOG.info("Opened georeferenced MAP dataset: %s", p)
            return RasterHandle(p, ds, used_map_file=True)
        except Exception as e:  # noqa: BLE001
            # Best effort: any MAP failure falls through to the TIFF.
            last_err = e
            LOG.warning("Could not open MAP dataset %s: %s", map_path, e)
    if tif_path:
        try:
            p = Path(tif_path)
            ds = rasterio.open(p)
            LOG.info("Opened TIFF dataset: %s", p)
            return RasterHandle(p, ds, used_map_file=False)
        except RasterioIOError as e:
            last_err = e
            LOG.warning("Could not open TIFF dataset %s: %s", tif_path, e)
    raise RuntimeError(f"Could not open raster. Last error: {last_err}") from last_err
def read_window_rgb(ds: rasterio.io.DatasetReader, window: Window) -> np.ndarray:
    """Read a raster window as uint8 RGB HxWx3.

    The window is read boundless with a fill value of 255. With three or more
    bands the first three are used. With one or two bands the first band is
    replicated into all three channels (the original code returned a
    two-channel array for two-band input, which broke the HxWx3 contract).
    Non-uint8 data is clipped to 0..255 before conversion.

    Raises:
        ValueError: if the read result is not a 3-D band-first array, or the
            dataset has no bands.
    """
    arr = ds.read(window=window, boundless=True, fill_value=255)
    if arr.ndim != 3:
        raise ValueError(f"Expected band-first array, got shape={arr.shape}")

    band_count = arr.shape[0]
    if band_count == 0:
        raise ValueError(f"Raster has no bands, got shape={arr.shape}")
    if band_count >= 3:
        arr = arr[:3]
    else:
        # NOTE(review): for a single band this shows raw values as grey; a
        # paletted TIFF would need its colormap applied first. Confirm against
        # the actual source rasters.
        arr = np.repeat(arr[:1], 3, axis=0)

    arr = np.moveaxis(arr, 0, -1)  # band-first -> HxWxC
    if arr.dtype != np.uint8:
        arr = np.clip(arr, 0, 255).astype(np.uint8)
    return arr
def iter_windows(width: int, height: int, tile_size: int, overlap: int):
    """Yield row-major ``Window`` tiles that cover a ``width`` x ``height`` raster.

    Neighbouring tiles start ``tile_size - overlap`` pixels apart (at least 1).
    Tiles at the right and bottom edges are truncated to the raster extent, and
    each axis stops after the first tile that reaches the edge.
    """
    stride = max(1, tile_size - overlap)

    def _starts(extent):
        # Start offsets along one axis.
        pos = 0
        while pos < extent:
            yield pos
            if pos + tile_size >= extent:
                return
            pos += stride

    for top in _starts(height):
        tile_h = min(tile_size, height - top)
        for left in _starts(width):
            tile_w = min(tile_size, width - left)
            yield Window(left, top, tile_w, tile_h)
def lonlat_to_pixel(ds: rasterio.io.DatasetReader, lon: float, lat: float) -> Tuple[int, int]:
    """Map lon/lat to a (row, col) pixel index using only the dataset transform.

    No reprojection happens here: ``lon``/``lat`` are passed to the transform
    as x/y, so this is only meaningful when the raster CRS takes lon/lat
    directly. A production version should reproject into ``ds.crs`` with
    pyproj first; the PoC does that in coordinates.py.
    """
    pixel_row, pixel_col = rowcol(ds.transform, lon, lat)
    return int(pixel_row), int(pixel_col)
def pixel_to_lonlat(ds: rasterio.io.DatasetReader, row: int, col: int) -> Tuple[float, float]:
    """Return the (x, y) coordinate of a pixel centre via the dataset transform.

    No reprojection is applied, so the result is in whatever coordinate system
    the dataset transform maps to.
    """
    centre_x, centre_y = xy(ds.transform, row, col, offset="center")
    return float(centre_x), float(centre_y)
def has_real_georef(ds: rasterio.io.DatasetReader) -> bool:
    """Return True when the dataset has a CRS and a non-identity transform.

    Any exception raised while inspecting the dataset counts as "no".
    """
    try:
        if ds.crs is None:
            return False
        return not ds.transform.is_identity
    except Exception:
        return False

115
bgtopo_poc/inventory.py Normal file
View File

@@ -0,0 +1,115 @@
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Iterable, List, Optional
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from .utils import ensure_dir
LOG = logging.getLogger(__name__)
@dataclass
class SheetAsset:
    """Remote URLs and, once downloaded, local paths of one map sheet."""

    sheet_id: str

    # Remote locations found in the directory listing
    map_url: Optional[str]
    tif_url: Optional[str]

    # Local files; filled in by the download step
    map_path: Optional[str] = None
    tif_path: Optional[str] = None

    def to_dict(self):
        """Return the asset as a plain dict keyed by field name."""
        return asdict(self)
def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
    """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing.

    Args:
        base_url: URL of the HTML directory listing.
        include_100k: When False, links containing ``100k`` (case-insensitive)
            are skipped.

    Returns:
        Assets that have both a ``.map`` and a ``.tif``/``.tiff`` link, sorted
        by sheet id.

    Raises:
        requests.HTTPError: if the listing request returns an error status.
            Previously an error page was parsed as if it were the listing and
            silently produced an empty result.
    """
    LOG.info("Discovering assets from %s", base_url)
    response = requests.get(base_url, timeout=60)
    response.raise_for_status()
    # Resolve links against the final URL so a redirect (for example to a
    # trailing-slash directory URL) does not drop the last path segment.
    listing_url = response.url or base_url
    soup = BeautifulSoup(response.text, "html.parser")

    raster_suffixes = (".map", ".tif", ".tiff")
    by_sheet: dict[str, SheetAsset] = {}
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        # Skip sort links, absolute paths and the parent-directory link.
        if href.startswith(("?", "/")) or href == "../":
            continue
        lowered = href.lower()
        if not include_100k and "100k" in lowered:
            continue
        if not lowered.endswith(raster_suffixes):
            continue
        sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
        item = by_sheet.setdefault(sheet_id, SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None))
        full_url = urljoin(listing_url, href)
        if lowered.endswith(".map"):
            item.map_url = full_url
        else:
            item.tif_url = full_url

    # Only sheets with both files are usable downstream.
    assets = [v for v in by_sheet.values() if v.map_url and v.tif_url]
    assets.sort(key=lambda x: x.sheet_id)
    LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
    return assets
def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
    """Write the assets to a CSV manifest, one row per asset, and return its path."""
    target = Path(out_csv)
    records = [asset.to_dict() for asset in assets]
    ensure_dir(target.parent)
    frame = pd.DataFrame(records)
    frame.to_csv(target, index=False)
    LOG.info("Wrote manifest: %s", target)
    return target
def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
    """Read a manifest CSV back into ``SheetAsset`` objects.

    All columns are read as strings so that sheet ids made only of digits keep
    their exact text (type inference would turn ``"0123"`` into ``123``).
    Empty or missing URL/path cells become ``None``.
    """
    df = pd.read_csv(path, dtype=str).fillna("")

    def _optional(record, key):
        # Empty string and missing column both mean "not set".
        return str(record.get(key) or "") or None

    return [
        SheetAsset(
            sheet_id=str(r["sheet_id"]),
            map_url=_optional(r, "map_url"),
            tif_url=_optional(r, "tif_url"),
            map_path=_optional(r, "map_path"),
            tif_path=_optional(r, "tif_path"),
        )
        for _, r in df.iterrows()
    ]
def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
if out_path.exists() and out_path.stat().st_size > 0 and not overwrite:
return out_path
ensure_dir(out_path.parent)
with requests.get(url, stream=True, timeout=120) as r:
r.raise_for_status()
total = int(r.headers.get("content-length", "0") or 0)
with open(out_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=out_path.name) as pbar:
for chunk in r.iter_content(chunk_size=1024 * 512):
if chunk:
f.write(chunk)
pbar.update(len(chunk))
return out_path
def download_assets(
    assets: List[SheetAsset],
    out_dir: str | Path,
    limit: Optional[int] = None,
    overwrite: bool = False,
) -> List[SheetAsset]:
    """Download the .map/.tif files of the first ``limit`` assets.

    Files go to ``<out_dir>/<sheet_id>/<sheet_id>.map`` and ``.tif``. The
    asset objects are updated in place with the local paths. A falsy
    ``limit`` (``None`` or 0) means all assets.

    Returns:
        The list of processed assets.
    """
    root = Path(out_dir)
    chosen = assets[:limit] if limit else assets
    for asset in chosen:
        target_dir = root / asset.sheet_id
        if asset.map_url:
            map_file = target_dir / f"{asset.sheet_id}.map"
            asset.map_path = str(_download_one(asset.map_url, map_file, overwrite=overwrite))
        if asset.tif_url:
            tif_file = target_dir / f"{asset.sheet_id}.tif"
            asset.tif_path = str(_download_one(asset.tif_url, tif_file, overwrite=overwrite))
    return chosen

80
bgtopo_poc/report.py Normal file
View File

@@ -0,0 +1,80 @@
from __future__ import annotations
from pathlib import Path
import pandas as pd
from jinja2 import Template
from .utils import ensure_dir
# Jinja2 source of the HTML QA report. It is rendered with the variables
# ``total``, ``avg_score``, ``med_score``, ``style_table`` (an HTML fragment)
# and ``overlays`` (items exposing ``name`` and ``rel``).
REPORT_TEMPLATE = """
<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<title>BGtopoVJ Blue Box PoC Report</title>
<style>
body { font-family: Arial, sans-serif; max-width: 1200px; margin: 24px auto; line-height: 1.45; }
table { border-collapse: collapse; width: 100%; margin: 16px 0; }
th, td { border: 1px solid #ddd; padding: 8px; font-size: 14px; }
th { background: #f3f3f3; text-align: left; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 18px; }
.card { border: 1px solid #ddd; border-radius: 12px; padding: 12px; }
img { max-width: 100%; border: 1px solid #ccc; }
code { background: #f6f6f6; padding: 2px 4px; }
</style>
</head>
<body>
<h1>BGtopoVJ Blue Rectangle/Square PoC Report</h1>
<p>This is a weak-label mining report. Treat candidates as review targets, not truth.</p>
<h2>Candidate summary</h2>
<table>
<tr><th>Metric</th><th>Value</th></tr>
<tr><td>Total candidates</td><td>{{ total }}</td></tr>
<tr><td>Average score</td><td>{{ avg_score }}</td></tr>
<tr><td>Median score</td><td>{{ med_score }}</td></tr>
</table>
<h2>By inferred fill style</h2>
{{ style_table }}
<h2>QA overlays</h2>
<div class="grid">
{% for overlay in overlays %}
<div class="card">
<p><code>{{ overlay.name }}</code></p>
<img src="{{ overlay.rel }}" />
</div>
{% endfor %}
</div>
</body>
</html>
"""
def build_report(candidate_csvs: list[str | Path], overlays: list[str | Path], out_html: str | Path) -> Path:
    """Render the HTML QA report for a set of candidate CSVs and overlay images.

    Candidate CSVs that do not exist are skipped. Overlay images are linked
    relative to the report's directory so the links resolve when the HTML file
    is opened, wherever the overlays live. The original fell back to a
    cwd-relative path that the browser resolved against the report directory,
    producing broken images.

    Args:
        candidate_csvs: Candidate CSV paths; their rows are concatenated.
        overlays: Overlay image paths to embed.
        out_html: Output HTML path; parent directories are created.

    Returns:
        ``out_html`` as a ``Path``.
    """
    import os  # local import: only used to compute report-relative links

    frames = [pd.read_csv(p) for p in candidate_csvs if Path(p).exists()]
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    out_html = Path(out_html)
    ensure_dir(out_html.parent)

    has_scores = not df.empty and "score" in df.columns
    style_table = "<p>No candidates.</p>"
    if has_scores and "fill_style" in df.columns:
        style_table = (
            df.groupby("fill_style")
            .agg(count=("fill_style", "size"), avg_score=("score", "mean"))
            .reset_index()
            .to_html(index=False)
        )

    report_dir = out_html.parent.resolve()
    overlay_items = []
    for ov in overlays:
        ov = Path(ov)
        try:
            rel = os.path.relpath(ov.resolve(), report_dir)
        except ValueError:
            # No relative path exists (e.g. different drives on Windows).
            rel = ov.resolve().as_uri()
        overlay_items.append({"name": ov.name, "rel": str(rel).replace("\\", "/")})

    html = Template(REPORT_TEMPLATE).render(
        total=0 if df.empty else len(df),
        avg_score=f"{df['score'].mean():.3f}" if has_scores else "",
        med_score=f"{df['score'].median():.3f}" if has_scores else "",
        style_table=style_table,
        overlays=overlay_items,
    )
    out_html.write_text(html, encoding="utf-8")
    return out_html

40
bgtopo_poc/train_yolo.py Normal file
View File

@@ -0,0 +1,40 @@
from __future__ import annotations
import logging
from pathlib import Path
LOG = logging.getLogger(__name__)
def train_yolo(data_yaml: str | Path, model: str = "yolov8s.pt", imgsz: int = 1024, epochs: int = 80, batch: int = 4, device: str = "0"):
    """Train a YOLO model on the exported weak-label dataset.

    ``ultralytics`` is imported inside the function so the rest of the PoC
    works without GPU dependencies. Review and correct the weak labels before
    treating the resulting model as useful.

    Returns:
        Whatever ``YOLO.train`` returns.
    """
    from ultralytics import YOLO

    yolo = YOLO(model)
    LOG.info("Starting YOLO training: model=%s data=%s imgsz=%d epochs=%d batch=%d device=%s", model, data_yaml, imgsz, epochs, batch, device)

    # Run directory name: <dataset folder>_<model file stem>.
    run_name = f"{Path(data_yaml).parent.name}_{Path(model).stem}"
    run_options = {
        "data": str(data_yaml),
        "imgsz": imgsz,
        "epochs": epochs,
        "batch": batch,
        "device": device,
        "workers": 4,
        "cache": False,
        "patience": 20,
        "project": "runs/bgtopo_bluebox",
        "name": run_name,
    }
    augmentation = {
        "hsv_h": 0.005,
        "hsv_s": 0.20,
        "hsv_v": 0.18,
        "degrees": 0.0,
        "translate": 0.05,
        "scale": 0.20,
        "fliplr": 0.5,
        "flipud": 0.5,
        "mosaic": 0.25,
        "close_mosaic": 15,
    }
    return yolo.train(**run_options, **augmentation)

43
bgtopo_poc/utils.py Normal file
View File

@@ -0,0 +1,43 @@
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any, Dict
import yaml
def setup_logging(verbose: bool = False) -> None:
    """Configure root logging: DEBUG when ``verbose`` is set, INFO otherwise."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-8s | %(message)s",
        datefmt="%H:%M:%S",
    )
def load_yaml(path: str | Path) -> Dict[str, Any]:
    """Parse a YAML file with ``yaml.safe_load``; a falsy result becomes ``{}``."""
    with open(path, "r", encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)
    return parsed or {}
def ensure_dir(path: str | Path) -> Path:
    """Create the directory (and any missing parents) and return it as a ``Path``."""
    directory = Path(path)
    directory.mkdir(parents=True, exist_ok=True)
    return directory
def write_json(path: str | Path, data: Any) -> None:
    """Write ``data`` as indented UTF-8 JSON, creating parent directories first."""
    target = Path(path)
    ensure_dir(target.parent)
    with open(target, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(data, handle, ensure_ascii=False, indent=2)
def read_json(path: str | Path) -> Any:
    """Load and return the content of a UTF-8 JSON file."""
    with open(path, "r", encoding="utf-8") as handle:
        content = json.load(handle)
    return content
def safe_stem(name: str) -> str:
    """Return the file stem of ``name`` with any ``/`` or ``\\`` replaced by ``_``."""
    stem = Path(name).stem
    for separator in ("/", "\\"):
        stem = stem.replace(separator, "_")
    return stem