This commit is contained in:
nq
2026-05-03 21:58:47 +03:00
parent fd44235ff4
commit e45d1cb6b4
18 changed files with 1259 additions and 0 deletions

1
bgtopo_poc/__init__.py Normal file
View File

@@ -0,0 +1 @@
__version__ = "0.1.0"

178
bgtopo_poc/cli.py Normal file
View File

@@ -0,0 +1,178 @@
from __future__ import annotations
import argparse
import logging
from pathlib import Path
from .coordinates import extract_coordinate_crops, score_coordinates_for_sheet
from .detector_cv import detect_sheet, draw_overlay
from .export_yolo import export_candidates_to_yolo
from .inventory import discover_original_assets, download_assets, read_manifest_csv, write_manifest_csv
from .report import build_report
from .train_yolo import train_yolo
from .utils import load_yaml, setup_logging
LOG = logging.getLogger(__name__)
def cmd_inventory(args):
cfg = load_yaml(args.config)
base_url = args.base_url or cfg["source"]["base_url"]
assets = discover_original_assets(base_url=base_url, include_100k=args.include_100k)
if args.limit:
assets = assets[: args.limit]
write_manifest_csv(assets, args.out)
def cmd_download(args):
assets = read_manifest_csv(args.manifest)
selected = download_assets(assets, args.out_dir, limit=args.limit, overwrite=args.overwrite)
write_manifest_csv(selected, args.out_manifest)
def cmd_detect(args):
cfg = load_yaml(args.config)
detect_sheet(args.map, args.tif, args.sheet_id, cfg, args.out_dir)
def cmd_overlay(args):
draw_overlay(args.tif, args.candidates, args.out)
def cmd_score_coords(args):
cfg = load_yaml(args.config)
score_coordinates_for_sheet(
coord_csv=args.coordinates,
candidates_csv=args.candidates,
map_path=args.map,
tif_path=args.tif,
sheet_id=args.sheet_id,
cfg=cfg,
out_dir=args.out_dir,
coord_crs=args.coord_crs,
)
def cmd_crops(args):
extract_coordinate_crops(
coord_scores_csv=args.scores,
map_path=args.map,
tif_path=args.tif,
out_dir=args.out_dir,
crop_size=args.crop_size,
)
def cmd_export_yolo(args):
cfg = load_yaml(args.config)
export_candidates_to_yolo(
tif_path=args.tif,
candidates_csv=args.candidates,
out_dir=args.out_dir,
cfg=cfg,
sheet_id=args.sheet_id,
tile_size=args.tile_size,
overlap=args.overlap,
val_fraction=args.val_fraction,
)
def cmd_train_yolo(args):
train_yolo(args.data_yaml, model=args.model, imgsz=args.imgsz, epochs=args.epochs, batch=args.batch, device=args.device)
def cmd_report(args):
build_report([Path(p) for p in args.candidates], [Path(p) for p in args.overlays], args.out)
def build_parser():
p = argparse.ArgumentParser(prog="bgtopo-bluebox", description="BGtopoVJ blue rectangle/square PoC pipeline")
p.add_argument("--verbose", action="store_true")
sub = p.add_subparsers(dest="cmd", required=True)
s = sub.add_parser("inventory", help="Crawl original raster directory and create manifest")
s.add_argument("--config", default="configs/blue_detector.yaml")
s.add_argument("--base-url", default=None)
s.add_argument("--out", default="data/manifest.csv")
s.add_argument("--limit", type=int, default=None)
s.add_argument("--include-100k", action="store_true")
s.set_defaults(func=cmd_inventory)
s = sub.add_parser("download", help="Download .map/.tif pairs from manifest")
s.add_argument("--manifest", default="data/manifest.csv")
s.add_argument("--out-dir", default="data/raw")
s.add_argument("--out-manifest", default="data/manifest_downloaded.csv")
s.add_argument("--limit", type=int, default=2)
s.add_argument("--overwrite", action="store_true")
s.set_defaults(func=cmd_download)
s = sub.add_parser("detect", help="Detect blue rectangle/square candidates on one sheet")
s.add_argument("--config", default="configs/blue_detector.yaml")
s.add_argument("--sheet-id", required=True)
s.add_argument("--map", default=None)
s.add_argument("--tif", required=True)
s.add_argument("--out-dir", default="data/interim/candidates")
s.set_defaults(func=cmd_detect)
s = sub.add_parser("overlay", help="Draw candidate overlay for manual QA")
s.add_argument("--tif", required=True)
s.add_argument("--candidates", required=True)
s.add_argument("--out", required=True)
s.set_defaults(func=cmd_overlay)
s = sub.add_parser("score-coords", help="Score coordinate CSV against candidates from one sheet")
s.add_argument("--config", default="configs/blue_detector.yaml")
s.add_argument("--sheet-id", required=True)
s.add_argument("--coordinates", required=True)
s.add_argument("--candidates", required=True)
s.add_argument("--map", default=None)
s.add_argument("--tif", required=True)
s.add_argument("--out-dir", default="data/interim/coordinate_scores")
s.add_argument("--coord-crs", default="EPSG:4326")
s.set_defaults(func=cmd_score_coords)
s = sub.add_parser("crops", help="Extract review crops around scored coordinates")
s.add_argument("--scores", required=True)
s.add_argument("--map", default=None)
s.add_argument("--tif", required=True)
s.add_argument("--out-dir", default="data/interim/crops")
s.add_argument("--crop-size", type=int, default=256)
s.set_defaults(func=cmd_crops)
s = sub.add_parser("export-yolo", help="Export weak candidates to YOLO dataset format")
s.add_argument("--config", default="configs/blue_detector.yaml")
s.add_argument("--sheet-id", required=True)
s.add_argument("--tif", required=True)
s.add_argument("--candidates", required=True)
s.add_argument("--out-dir", default="data/yolo/bluebox")
s.add_argument("--tile-size", type=int, default=1024)
s.add_argument("--overlap", type=int, default=128)
s.add_argument("--val-fraction", type=float, default=0.20)
s.set_defaults(func=cmd_export_yolo)
s = sub.add_parser("train-yolo", help="Train YOLO on exported dataset")
s.add_argument("--data-yaml", required=True)
s.add_argument("--model", default="yolov8s.pt")
s.add_argument("--imgsz", type=int, default=1024)
s.add_argument("--epochs", type=int, default=80)
s.add_argument("--batch", type=int, default=4)
s.add_argument("--device", default="0")
s.set_defaults(func=cmd_train_yolo)
s = sub.add_parser("report", help="Build HTML QA report")
s.add_argument("--candidates", nargs="+", required=True)
s.add_argument("--overlays", nargs="*", default=[])
s.add_argument("--out", default="reports/poc_report.html")
s.set_defaults(func=cmd_report)
return p
def main():
parser = build_parser()
args = parser.parse_args()
setup_logging(args.verbose)
args.func(args)
if __name__ == "__main__":
main()

150
bgtopo_poc/coordinates.py Normal file
View File

@@ -0,0 +1,150 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
from pyproj import Transformer
from rasterio.windows import Window
from .georef import open_georaster, read_window_rgb
from .utils import ensure_dir
LOG = logging.getLogger(__name__)
def _normalize_coordinate_columns(df: pd.DataFrame) -> pd.DataFrame:
cols = {c.lower().strip(): c for c in df.columns}
lat_col = cols.get("lat") or cols.get("latitude") or cols.get("y")
lon_col = cols.get("lon") or cols.get("lng") or cols.get("longitude") or cols.get("x")
if not lat_col or not lon_col:
raise ValueError("Coordinate CSV needs lat/lon columns, or latitude/longitude, or y/x.")
out = df.copy()
out["lat"] = pd.to_numeric(out[lat_col], errors="coerce")
out["lon"] = pd.to_numeric(out[lon_col], errors="coerce")
if "id" not in out.columns:
out["id"] = [f"pt_{i:06d}" for i in range(len(out))]
return out.dropna(subset=["lat", "lon"])
def load_coordinates(path: str | Path) -> pd.DataFrame:
return _normalize_coordinate_columns(pd.read_csv(path))
def coord_to_rowcol(ds, lon: float, lat: float, coord_crs: str = "EPSG:4326") -> Optional[tuple[int, int]]:
if ds.crs is None:
return None
try:
transformer = Transformer.from_crs(coord_crs, ds.crs, always_xy=True)
x, y = transformer.transform(lon, lat)
row, col = ds.index(x, y)
return int(row), int(col)
except Exception as e: # noqa: BLE001
LOG.debug("coord_to_rowcol failed: %s", e)
return None
def score_coordinates_for_sheet(
coord_csv: str | Path,
candidates_csv: str | Path,
map_path: str | None,
tif_path: str | None,
sheet_id: str,
cfg: Dict,
out_dir: str | Path,
coord_crs: str = "EPSG:4326",
) -> Path:
out_dir = ensure_dir(out_dir)
coords = load_coordinates(coord_csv)
cands = pd.read_csv(candidates_csv)
rh = open_georaster(map_path=map_path, tif_path=tif_path)
radius = float(cfg["coordinate_scoring"].get("search_radius_px", 45))
try:
rows: List[dict] = []
for _, pt in coords.iterrows():
rc = coord_to_rowcol(rh.dataset, float(pt.lon), float(pt.lat), coord_crs=coord_crs)
if rc is None:
continue
row, col = rc
if row < 0 or col < 0 or row >= rh.height or col >= rh.width:
continue
if cands.empty:
nearest = None
else:
dx = cands["cx"].astype(float).to_numpy() - col
dy = cands["cy"].astype(float).to_numpy() - row
dist = np.sqrt(dx * dx + dy * dy)
i = int(np.argmin(dist))
nearest = (i, float(dist[i]))
score = 0.0
nearest_id = None
nearest_dist = None
nearest_style = None
nearest_det_score = None
decision = "auto_negative"
if nearest:
i, d = nearest
nearest_dist = d
if d <= radius:
nearest_id = i
nearest_style = str(cands.iloc[i].get("fill_style", "unknown"))
nearest_det_score = float(cands.iloc[i].get("score", 0.0))
dist_factor = max(0.0, 1.0 - d / radius)
score = float(0.55 * nearest_det_score + 0.45 * dist_factor)
if score >= float(cfg["coordinate_scoring"].get("strong_score", 0.90)):
decision = "auto_positive"
elif score >= float(cfg["coordinate_scoring"].get("weak_score", 0.40)):
decision = "review"
rows.append({
"id": pt.id,
"sheet_id": sheet_id,
"lat": float(pt.lat),
"lon": float(pt.lon),
"row": row,
"col": col,
"nearest_candidate_index": nearest_id,
"nearest_candidate_distance_px": nearest_dist,
"nearest_candidate_style": nearest_style,
"nearest_candidate_score": nearest_det_score,
"coordinate_score": score,
"decision": decision,
})
out_csv = Path(out_dir) / f"{sheet_id}_coordinate_scores.csv"
pd.DataFrame(rows).to_csv(out_csv, index=False)
LOG.info("Wrote coordinate scores: %s", out_csv)
return out_csv
finally:
rh.close()
def extract_coordinate_crops(
coord_scores_csv: str | Path,
map_path: str | None,
tif_path: str | None,
out_dir: str | Path,
crop_size: int = 256,
only_decisions: tuple[str, ...] = ("review", "auto_positive"),
) -> Path:
out_dir = ensure_dir(out_dir)
df = pd.read_csv(coord_scores_csv)
rh = open_georaster(map_path=map_path, tif_path=tif_path)
half = crop_size // 2
try:
for _, r in df.iterrows():
if str(r.decision) not in only_decisions:
continue
row, col = int(r.row), int(r.col)
win = Window(col - half, row - half, crop_size, crop_size)
rgb = read_window_rgb(rh.dataset, win)
img = Image.fromarray(rgb).convert("RGB")
draw = ImageDraw.Draw(img)
draw.ellipse([half - 5, half - 5, half + 5, half + 5], outline=(255, 0, 0), width=2)
name = f"{str(r.id)}__{str(r.decision)}__score_{float(r.coordinate_score):.3f}.png"
img.save(Path(out_dir) / name)
LOG.info("Wrote crops into: %s", out_dir)
return Path(out_dir)
finally:
rh.close()

223
bgtopo_poc/detector_cv.py Normal file
View File

@@ -0,0 +1,223 @@
from __future__ import annotations
import logging
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
import cv2
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
from rasterio.windows import Window
from tqdm import tqdm
from .georef import iter_windows, open_georaster, pixel_to_lonlat, read_window_rgb
from .utils import ensure_dir, safe_stem
LOG = logging.getLogger(__name__)
@dataclass
class Candidate:
sheet_id: str
source_path: str
x: int
y: int
w: int
h: int
cx: float
cy: float
area: float
aspect: float
blue_fill_ratio: float
rectangularity: float
solidity: float
approx_vertices: int
fill_style: str
score: float
lon: float | None = None
lat: float | None = None
def to_dict(self):
return asdict(self)
def build_blue_mask(rgb: np.ndarray, cfg: Dict) -> np.ndarray:
hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV)
mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
for r in cfg["detector"].get("hsv_ranges", []):
lower = np.array(r["lower"], dtype=np.uint8)
upper = np.array(r["upper"], dtype=np.uint8)
mask |= cv2.inRange(hsv, lower, upper)
morph = cfg["detector"].get("morphology", {})
open_k = int(morph.get("open_kernel", 0) or 0)
close_k = int(morph.get("close_kernel", 0) or 0)
if open_k > 1:
k = np.ones((open_k, open_k), np.uint8)
mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, k)
if close_k > 1:
k = np.ones((close_k, close_k), np.uint8)
mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, k)
dilate_iter = int(morph.get("dilate_iterations", 0) or 0)
if dilate_iter > 0:
mask = cv2.dilate(mask, np.ones((2, 2), np.uint8), iterations=dilate_iter)
return mask
def _classify_fill_style(blue_fill_ratio: float, rectangularity: float, solidity: float) -> str:
if blue_fill_ratio >= 0.48 and solidity >= 0.60:
return "filled"
if 0.10 <= blue_fill_ratio < 0.42 and rectangularity >= 0.28:
return "hollow"
if blue_fill_ratio < 0.18 and rectangularity >= 0.18:
return "border"
return "unknown"
def _score_candidate(blue_fill_ratio: float, rectangularity: float, solidity: float, aspect: float, approx_vertices: int) -> float:
aspect_bonus = 1.0 - min(abs(np.log(max(aspect, 1e-3))), 1.8) / 1.8
vertex_bonus = 1.0 if 4 <= approx_vertices <= 8 else 0.55
raw = 0.30 * blue_fill_ratio + 0.28 * rectangularity + 0.22 * solidity + 0.12 * aspect_bonus + 0.08 * vertex_bonus
return float(max(0.0, min(1.0, raw)))
def find_candidates_in_rgb(rgb: np.ndarray, cfg: Dict, sheet_id: str, source_path: str, xoff: int = 0, yoff: int = 0) -> List[Candidate]:
det = cfg["detector"]
mask = build_blue_mask(rgb, cfg)
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
out: List[Candidate] = []
for contour in contours:
area = float(cv2.contourArea(contour))
if area < det["min_area_px"] or area > det["max_area_px"]:
continue
x, y, w, h = cv2.boundingRect(contour)
if w < det["min_width_px"] or h < det["min_height_px"]:
continue
if w > det["max_width_px"] or h > det["max_height_px"]:
continue
aspect = float(w / max(h, 1))
if not (det["min_aspect"] <= aspect <= det["max_aspect"]):
continue
bbox_mask = mask[y:y + h, x:x + w]
blue_fill_ratio = float(np.count_nonzero(bbox_mask) / max(w * h, 1))
if not (det["min_blue_fill_ratio"] <= blue_fill_ratio <= det["max_blue_fill_ratio"]):
continue
rectangularity = float(area / max(w * h, 1))
if rectangularity < det["min_rectangularity"]:
continue
hull = cv2.convexHull(contour)
hull_area = float(cv2.contourArea(hull))
solidity = float(area / hull_area) if hull_area > 0 else 0.0
if solidity < det["min_solidity"]:
continue
peri = float(cv2.arcLength(contour, True))
approx = cv2.approxPolyDP(contour, 0.04 * peri, True)
fill_style = _classify_fill_style(blue_fill_ratio, rectangularity, solidity)
score = _score_candidate(blue_fill_ratio, rectangularity, solidity, aspect, len(approx))
out.append(
Candidate(
sheet_id=sheet_id,
source_path=source_path,
x=int(x + xoff),
y=int(y + yoff),
w=int(w),
h=int(h),
cx=float(x + xoff + w / 2),
cy=float(y + yoff + h / 2),
area=area,
aspect=aspect,
blue_fill_ratio=blue_fill_ratio,
rectangularity=rectangularity,
solidity=solidity,
approx_vertices=int(len(approx)),
fill_style=fill_style,
score=score,
)
)
return out
def bbox_iou(a: Candidate, b: Candidate) -> float:
ax1, ay1, ax2, ay2 = a.x, a.y, a.x + a.w, a.y + a.h
bx1, by1, bx2, by2 = b.x, b.y, b.x + b.w, b.y + b.h
ix1, iy1 = max(ax1, bx1), max(ay1, by1)
ix2, iy2 = min(ax2, bx2), min(ay2, by2)
iw, ih = max(0, ix2 - ix1), max(0, iy2 - iy1)
inter = iw * ih
union = a.w * a.h + b.w * b.h - inter
return float(inter / union) if union else 0.0
def nms(cands: Iterable[Candidate], threshold: float) -> List[Candidate]:
items = sorted(cands, key=lambda c: c.score, reverse=True)
kept: List[Candidate] = []
for c in items:
if all(bbox_iou(c, k) < threshold for k in kept):
kept.append(c)
return kept
def detect_sheet(map_path: str | None, tif_path: str | None, sheet_id: str, cfg: Dict, out_dir: str | Path) -> Path:
out_dir = ensure_dir(out_dir)
rh = open_georaster(map_path=map_path, tif_path=tif_path)
try:
tile_size = int(cfg["detector"]["tile_size"])
overlap = int(cfg["detector"]["tile_overlap"])
all_candidates: List[Candidate] = []
windows = list(iter_windows(rh.width, rh.height, tile_size, overlap))
LOG.info("Scanning %s as %d windows (%dx%d px)", sheet_id, len(windows), rh.width, rh.height)
for win in tqdm(windows, desc=f"scan {sheet_id}"):
rgb = read_window_rgb(rh.dataset, win)
cands = find_candidates_in_rgb(
rgb=rgb,
cfg=cfg,
sheet_id=sheet_id,
source_path=str(rh.path),
xoff=int(win.col_off),
yoff=int(win.row_off),
)
all_candidates.extend(cands)
kept = nms(all_candidates, float(cfg["detector"].get("nms_iou_threshold", 0.25)))
# Attach georeferenced centers when possible.
for c in kept:
if rh.crs is not None:
xgeo, ygeo = pixel_to_lonlat(rh.dataset, int(c.cy), int(c.cx))
c.lon = xgeo
c.lat = ygeo
out_csv = out_dir / f"{sheet_id}_candidates.csv"
pd.DataFrame([c.to_dict() for c in kept]).to_csv(out_csv, index=False)
LOG.info("%s: wrote %d candidates to %s", sheet_id, len(kept), out_csv)
return out_csv
finally:
rh.close()
def draw_overlay(tif_path: str | Path, candidates_csv: str | Path, out_png: str | Path, max_side: int = 2400) -> Path:
"""Draw candidates on a downscaled image. Uses TIFF path for simple visual QA."""
img = Image.open(tif_path).convert("RGB")
scale = min(1.0, max_side / max(img.size))
disp = img.resize((int(img.width * scale), int(img.height * scale))) if scale < 1 else img.copy()
draw = ImageDraw.Draw(disp)
df = pd.read_csv(candidates_csv)
color_by_style = {
"filled": (0, 255, 255),
"hollow": (0, 120, 255),
"border": (20, 20, 255),
"unknown": (255, 255, 0),
}
for _, r in df.iterrows():
color = color_by_style.get(str(r.get("fill_style", "unknown")), (255, 255, 0))
x1, y1 = float(r.x) * scale, float(r.y) * scale
x2, y2 = float(r.x + r.w) * scale, float(r.y + r.h) * scale
draw.rectangle([x1, y1, x2, y2], outline=color, width=max(1, int(2 * scale + 1)))
out_png = Path(out_png)
ensure_dir(out_png.parent)
disp.save(out_png)
LOG.info("Wrote overlay: %s", out_png)
return out_png

117
bgtopo_poc/export_yolo.py Normal file
View File

@@ -0,0 +1,117 @@
from __future__ import annotations
import logging
import random
import shutil
from pathlib import Path
from typing import Dict
import pandas as pd
from PIL import Image
from .utils import ensure_dir
LOG = logging.getLogger(__name__)
STYLE_TO_CLASS = {
"unknown": 0,
"filled": 1,
"hollow": 2,
"border": 3,
}
def _crop_image_and_labels(img: Image.Image, boxes: pd.DataFrame, x0: int, y0: int, size: int):
crop = img.crop((x0, y0, x0 + size, y0 + size)).convert("RGB")
labels = []
for _, r in boxes.iterrows():
bx1, by1, bx2, by2 = float(r.x), float(r.y), float(r.x + r.w), float(r.y + r.h)
ix1, iy1 = max(bx1, x0), max(by1, y0)
ix2, iy2 = min(bx2, x0 + size), min(by2, y0 + size)
if ix2 <= ix1 or iy2 <= iy1:
continue
visible_area = (ix2 - ix1) * (iy2 - iy1)
box_area = max((bx2 - bx1) * (by2 - by1), 1)
if visible_area / box_area < 0.35:
continue
cx = ((ix1 + ix2) / 2 - x0) / size
cy = ((iy1 + iy2) / 2 - y0) / size
w = (ix2 - ix1) / size
h = (iy2 - iy1) / size
cls = STYLE_TO_CLASS.get(str(r.get("fill_style", "unknown")), 0)
labels.append(f"{cls} {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}")
return crop, labels
def export_candidates_to_yolo(
tif_path: str | Path,
candidates_csv: str | Path,
out_dir: str | Path,
cfg: Dict,
sheet_id: str,
tile_size: int = 1024,
overlap: int = 128,
val_fraction: float = 0.20,
include_empty_tiles: bool = True,
max_empty_tiles: int = 250,
) -> Path:
out_dir = Path(out_dir)
for split in ["train", "val"]:
ensure_dir(out_dir / "images" / split)
ensure_dir(out_dir / "labels" / split)
img = Image.open(tif_path).convert("RGB")
boxes = pd.read_csv(candidates_csv)
step = max(1, tile_size - overlap)
random.seed(42)
empty_written = 0
total_written = 0
for y0 in range(0, max(1, img.height - tile_size + 1), step):
for x0 in range(0, max(1, img.width - tile_size + 1), step):
in_tile = boxes[
(boxes.cx >= x0) & (boxes.cx < x0 + tile_size) &
(boxes.cy >= y0) & (boxes.cy < y0 + tile_size)
]
if in_tile.empty:
if not include_empty_tiles or empty_written >= max_empty_tiles:
continue
# Keep some empty/hard-negative tiles to stop the model from detecting all blue map details.
if random.random() > 0.08:
continue
empty_written += 1
crop, labels = _crop_image_and_labels(img, boxes, x0, y0, tile_size)
split = "val" if random.random() < val_fraction else "train"
stem = f"{sheet_id}_{x0}_{y0}"
crop.save(out_dir / "images" / split / f"{stem}.jpg", quality=92)
with open(out_dir / "labels" / split / f"{stem}.txt", "w", encoding="utf-8") as f:
f.write("\n".join(labels))
total_written += 1
data_yaml = out_dir / "data.yaml"
names = cfg.get("export", {}).get("yolo_class_names", ["blue_rect_unknown", "blue_rect_filled", "blue_rect_hollow", "blue_rect_border"])
with open(data_yaml, "w", encoding="utf-8") as f:
f.write(f"path: {out_dir.resolve()}\n")
f.write("train: images/train\n")
f.write("val: images/val\n")
f.write("names:\n")
for i, name in enumerate(names):
f.write(f" {i}: {name}\n")
LOG.info("YOLO export complete: %s (%d tiles)", data_yaml, total_written)
return data_yaml
def merge_yolo_datasets(src_dirs: list[str | Path], out_dir: str | Path) -> Path:
out_dir = Path(out_dir)
for split in ["train", "val"]:
ensure_dir(out_dir / "images" / split)
ensure_dir(out_dir / "labels" / split)
for src in src_dirs:
src = Path(src)
for split in ["train", "val"]:
for img in (src / "images" / split).glob("*.jpg"):
shutil.copy2(img, out_dir / "images" / split / img.name)
for lab in (src / "labels" / split).glob("*.txt"):
shutil.copy2(lab, out_dir / "labels" / split / lab.name)
return out_dir

116
bgtopo_poc/georef.py Normal file
View File

@@ -0,0 +1,116 @@
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple
import numpy as np
import rasterio
from rasterio.errors import RasterioIOError
from rasterio.transform import rowcol, xy
from rasterio.windows import Window
LOG = logging.getLogger(__name__)
@dataclass
class RasterHandle:
path: Path
dataset: rasterio.io.DatasetReader
used_map_file: bool
@property
def width(self) -> int:
return self.dataset.width
@property
def height(self) -> int:
return self.dataset.height
@property
def crs(self):
return self.dataset.crs
@property
def transform(self):
return self.dataset.transform
def close(self) -> None:
self.dataset.close()
def open_georaster(map_path: Optional[str | Path] = None, tif_path: Optional[str | Path] = None) -> RasterHandle:
"""Open .map if possible, otherwise .tif. .map gives georeferencing through GDAL's MAP driver."""
last_err = None
if map_path:
try:
p = Path(map_path)
ds = rasterio.open(p)
LOG.info("Opened georeferenced MAP dataset: %s", p)
return RasterHandle(p, ds, used_map_file=True)
except Exception as e: # noqa: BLE001
last_err = e
LOG.warning("Could not open MAP dataset %s: %s", map_path, e)
if tif_path:
try:
p = Path(tif_path)
ds = rasterio.open(p)
LOG.info("Opened TIFF dataset: %s", p)
return RasterHandle(p, ds, used_map_file=False)
except RasterioIOError as e:
last_err = e
raise RuntimeError(f"Could not open raster. Last error: {last_err}")
def read_window_rgb(ds: rasterio.io.DatasetReader, window: Window) -> np.ndarray:
"""Read a raster window as uint8 RGB HxWx3."""
arr = ds.read(window=window, boundless=True, fill_value=255)
if arr.ndim != 3:
raise ValueError(f"Expected band-first array, got shape={arr.shape}")
if arr.shape[0] >= 3:
arr = arr[:3]
elif arr.shape[0] == 1:
arr = np.repeat(arr, 3, axis=0)
arr = np.moveaxis(arr, 0, -1)
if arr.dtype != np.uint8:
arr = np.clip(arr, 0, 255).astype(np.uint8)
return arr
def iter_windows(width: int, height: int, tile_size: int, overlap: int):
step = max(1, tile_size - overlap)
y = 0
while y < height:
x = 0
h = min(tile_size, height - y)
while x < width:
w = min(tile_size, width - x)
yield Window(x, y, w, h)
if x + tile_size >= width:
break
x += step
if y + tile_size >= height:
break
y += step
def lonlat_to_pixel(ds: rasterio.io.DatasetReader, lon: float, lat: float) -> Tuple[int, int]:
"""Convert lon/lat to row/col. Assumes the raster CRS accepts lon/lat or GDAL handles geographic transform.
For a production version, reproject coordinates into ds.crs with pyproj first. The PoC does that in coordinates.py.
"""
row, col = rowcol(ds.transform, lon, lat)
return int(row), int(col)
def pixel_to_lonlat(ds: rasterio.io.DatasetReader, row: int, col: int) -> Tuple[float, float]:
x, y = xy(ds.transform, row, col, offset="center")
return float(x), float(y)
def has_real_georef(ds: rasterio.io.DatasetReader) -> bool:
try:
return ds.crs is not None and not ds.transform.is_identity
except Exception:
return False

115
bgtopo_poc/inventory.py Normal file
View File

@@ -0,0 +1,115 @@
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Iterable, List, Optional
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from .utils import ensure_dir
LOG = logging.getLogger(__name__)
@dataclass
class SheetAsset:
sheet_id: str
map_url: Optional[str]
tif_url: Optional[str]
map_path: Optional[str] = None
tif_path: Optional[str] = None
def to_dict(self):
return asdict(self)
def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
"""Discover .map/.tif pairs from the BGtopoVJ original raster directory listing."""
LOG.info("Discovering assets from %s", base_url)
html = requests.get(base_url, timeout=60).text
soup = BeautifulSoup(html, "html.parser")
hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]
by_sheet: dict[str, SheetAsset] = {}
for href in hrefs:
if href.startswith("?") or href.startswith("/") or href == "../":
continue
if not include_100k and "100k" in href.lower():
continue
if not (href.lower().endswith(".map") or href.lower().endswith(".tif") or href.lower().endswith(".tiff")):
continue
sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
item = by_sheet.setdefault(sheet_id, SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None))
full_url = urljoin(base_url, href)
if href.lower().endswith(".map"):
item.map_url = full_url
else:
item.tif_url = full_url
assets = [v for v in by_sheet.values() if v.map_url and v.tif_url]
assets.sort(key=lambda x: x.sheet_id)
LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
return assets
def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
rows = [a.to_dict() for a in assets]
out_csv = Path(out_csv)
ensure_dir(out_csv.parent)
pd.DataFrame(rows).to_csv(out_csv, index=False)
LOG.info("Wrote manifest: %s", out_csv)
return out_csv
def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
df = pd.read_csv(path).fillna("")
assets: List[SheetAsset] = []
for _, r in df.iterrows():
assets.append(
SheetAsset(
sheet_id=str(r["sheet_id"]),
map_url=str(r.get("map_url") or "") or None,
tif_url=str(r.get("tif_url") or "") or None,
map_path=str(r.get("map_path") or "") or None,
tif_path=str(r.get("tif_path") or "") or None,
)
)
return assets
def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
if out_path.exists() and out_path.stat().st_size > 0 and not overwrite:
return out_path
ensure_dir(out_path.parent)
with requests.get(url, stream=True, timeout=120) as r:
r.raise_for_status()
total = int(r.headers.get("content-length", "0") or 0)
with open(out_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=out_path.name) as pbar:
for chunk in r.iter_content(chunk_size=1024 * 512):
if chunk:
f.write(chunk)
pbar.update(len(chunk))
return out_path
def download_assets(
assets: List[SheetAsset],
out_dir: str | Path,
limit: Optional[int] = None,
overwrite: bool = False,
) -> List[SheetAsset]:
out_dir = Path(out_dir)
selected = assets[:limit] if limit else assets
for item in selected:
sheet_dir = out_dir / item.sheet_id
if item.map_url:
item.map_path = str(_download_one(item.map_url, sheet_dir / f"{item.sheet_id}.map", overwrite=overwrite))
if item.tif_url:
item.tif_path = str(_download_one(item.tif_url, sheet_dir / f"{item.sheet_id}.tif", overwrite=overwrite))
return selected

80
bgtopo_poc/report.py Normal file
View File

@@ -0,0 +1,80 @@
from __future__ import annotations
from pathlib import Path
import pandas as pd
from jinja2 import Template
from .utils import ensure_dir
REPORT_TEMPLATE = """
<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<title>BGtopoVJ Blue Box PoC Report</title>
<style>
body { font-family: Arial, sans-serif; max-width: 1200px; margin: 24px auto; line-height: 1.45; }
table { border-collapse: collapse; width: 100%; margin: 16px 0; }
th, td { border: 1px solid #ddd; padding: 8px; font-size: 14px; }
th { background: #f3f3f3; text-align: left; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 18px; }
.card { border: 1px solid #ddd; border-radius: 12px; padding: 12px; }
img { max-width: 100%; border: 1px solid #ccc; }
code { background: #f6f6f6; padding: 2px 4px; }
</style>
</head>
<body>
<h1>BGtopoVJ Blue Rectangle/Square PoC Report</h1>
<p>This is a weak-label mining report. Treat candidates as review targets, not truth.</p>
<h2>Candidate summary</h2>
<table>
<tr><th>Metric</th><th>Value</th></tr>
<tr><td>Total candidates</td><td>{{ total }}</td></tr>
<tr><td>Average score</td><td>{{ avg_score }}</td></tr>
<tr><td>Median score</td><td>{{ med_score }}</td></tr>
</table>
<h2>By inferred fill style</h2>
{{ style_table }}
<h2>QA overlays</h2>
<div class="grid">
{% for overlay in overlays %}
<div class="card">
<p><code>{{ overlay.name }}</code></p>
<img src="{{ overlay.rel }}" />
</div>
{% endfor %}
</div>
</body>
</html>
"""
def build_report(candidate_csvs: list[str | Path], overlays: list[str | Path], out_html: str | Path) -> Path:
frames = [pd.read_csv(p) for p in candidate_csvs if Path(p).exists()]
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
out_html = Path(out_html)
ensure_dir(out_html.parent)
style_table = "<p>No candidates.</p>"
if not df.empty:
style_table = df.groupby("fill_style").agg(count=("fill_style", "size"), avg_score=("score", "mean")).reset_index().to_html(index=False)
overlay_items = []
for ov in overlays:
ov = Path(ov)
try:
rel = ov.relative_to(out_html.parent)
except ValueError:
rel = ov
overlay_items.append({"name": ov.name, "rel": str(rel).replace("\\", "/")})
html = Template(REPORT_TEMPLATE).render(
total=0 if df.empty else len(df),
avg_score="" if df.empty else f"{df['score'].mean():.3f}",
med_score="" if df.empty else f"{df['score'].median():.3f}",
style_table=style_table,
overlays=overlay_items,
)
out_html.write_text(html, encoding="utf-8")
return out_html

40
bgtopo_poc/train_yolo.py Normal file
View File

@@ -0,0 +1,40 @@
from __future__ import annotations
import logging
from pathlib import Path
LOG = logging.getLogger(__name__)
def train_yolo(data_yaml: str | Path, model: str = "yolov8s.pt", imgsz: int = 1024, epochs: int = 80, batch: int = 4, device: str = "0"):
"""Train YOLO on the generated weak-label dataset.
This function imports ultralytics lazily so the rest of the PoC works without GPU dependencies.
Review/correct the weak labels before treating this model as useful.
"""
from ultralytics import YOLO
yolo = YOLO(model)
LOG.info("Starting YOLO training: model=%s data=%s imgsz=%d epochs=%d batch=%d device=%s", model, data_yaml, imgsz, epochs, batch, device)
return yolo.train(
data=str(data_yaml),
imgsz=imgsz,
epochs=epochs,
batch=batch,
device=device,
workers=4,
cache=False,
patience=20,
project="runs/bgtopo_bluebox",
name=f"{Path(data_yaml).parent.name}_{Path(model).stem}",
hsv_h=0.005,
hsv_s=0.20,
hsv_v=0.18,
degrees=0.0,
translate=0.05,
scale=0.20,
fliplr=0.5,
flipud=0.5,
mosaic=0.25,
close_mosaic=15,
)

43
bgtopo_poc/utils.py Normal file
View File

@@ -0,0 +1,43 @@
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any, Dict
import yaml
def setup_logging(verbose: bool = False) -> None:
logging.basicConfig(
level=logging.DEBUG if verbose else logging.INFO,
format="%(asctime)s | %(levelname)-8s | %(message)s",
datefmt="%H:%M:%S",
)
def load_yaml(path: str | Path) -> Dict[str, Any]:
with open(path, "r", encoding="utf-8") as f:
return yaml.safe_load(f) or {}
def ensure_dir(path: str | Path) -> Path:
p = Path(path)
p.mkdir(parents=True, exist_ok=True)
return p
def write_json(path: str | Path, data: Any) -> None:
p = Path(path)
ensure_dir(p.parent)
with open(p, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
def read_json(path: str | Path) -> Any:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def safe_stem(name: str) -> str:
return Path(name).stem.replace("/", "_").replace("\\", "_")

View File

@@ -0,0 +1,47 @@
# Tuned to be permissive. Expect false positives from rivers, lakes, names and contours.
# The purpose is weak-label mining + hard-negative discovery, not final truth.
source:
base_url: "https://web.uni-plovdiv.net/vedrin/raster/original/"
default_crs: "EPSG:4326"
detector:
tile_size: 1400
tile_overlap: 96
min_width_px: 5
min_height_px: 5
max_width_px: 95
max_height_px: 95
min_area_px: 18
max_area_px: 6500
min_aspect: 0.35
max_aspect: 4.25
min_blue_fill_ratio: 0.045
max_blue_fill_ratio: 0.92
min_rectangularity: 0.18
min_solidity: 0.22
nms_iou_threshold: 0.25
# OpenCV HSV hue range is 0-179, not 0-360.
hsv_ranges:
- name: blue_dark_to_light
lower: [82, 28, 45]
upper: [137, 255, 255]
- name: pale_cyan_low_sat
lower: [78, 10, 105]
upper: [115, 115, 255]
morphology:
open_kernel: 2
close_kernel: 3
dilate_iterations: 1
coordinate_scoring:
search_radius_px: 45
strong_score: 0.90
weak_score: 0.40
crop_size_px: 256
export:
yolo_class_names:
- blue_rect_unknown
- blue_rect_filled
- blue_rect_hollow
- blue_rect_border

View File

@@ -0,0 +1,3 @@
id,lat,lon,expected
example_001,42.58837223,23.19638729,unknown
example_002,42.70000000,23.32000000,unknown
1 id lat lon expected
2 example_001 42.58837223 23.19638729 unknown
3 example_002 42.70000000 23.32000000 unknown

18
docker-compose.gpu.yml Normal file
View File

@@ -0,0 +1,18 @@
services:
bgtopo-bluebox:
build:
context: .
dockerfile: docker/Dockerfile.gpu
image: bgtopo-bluebox-poc:gpu
working_dir: /app
volumes:
- .:/app
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
stdin_open: true
tty: true

19
docker/Dockerfile.gpu Normal file
View File

@@ -0,0 +1,19 @@
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-venv git wget curl ca-certificates \
gdal-bin libgdal-dev python3-gdal libgl1 libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install --upgrade pip && \
python3 -m pip install --no-cache-dir -r requirements.txt
COPY . /app
ENV PYTHONPATH=/app
CMD ["bash"]

12
pyproject.toml Normal file
View File

@@ -0,0 +1,12 @@
[project]
name = "bgtopo-bluebox-poc"
version = "0.1.0"
description = "PoC pipeline for detecting blue square/rectangle symbols in BGtopoVJ rasters and scoring coordinate points."
requires-python = ">=3.10"
dependencies = []
[tool.setuptools]
packages = ["bgtopo_poc"]
[project.scripts]
bgtopo-bluebox = "bgtopo_poc.cli:main"

22
requirements.txt Normal file
View File

@@ -0,0 +1,22 @@
# Core GIS / raster stack
rasterio>=1.3.9
pyproj>=3.6.1
shapely>=2.0.2
geopandas>=0.14.4
fiona>=1.9.6
# CV / ML
opencv-python>=4.9.0
numpy>=1.26.4
pandas>=2.2.1
Pillow>=10.2.0
scikit-learn>=1.4.1
ultralytics>=8.2.0
PyYAML>=6.0.1
tqdm>=4.66.2
requests>=2.31.0
beautifulsoup4>=4.12.3
# Reports / QA
matplotlib>=3.8.3
jinja2>=3.1.3

49
scripts/run_pilot.sh Normal file
View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
set -euo pipefail
# Run from repository root.
# First run downloads two sheets only. Increase --limit after confirming overlays look sane.
python -m bgtopo_poc.cli inventory \
--config configs/blue_detector.yaml \
--out data/manifest.csv \
--limit 2
python -m bgtopo_poc.cli download \
--manifest data/manifest.csv \
--out-dir data/raw \
--out-manifest data/manifest_downloaded.csv \
--limit 2
python - <<'PY'
import pandas as pd
from pathlib import Path
m = pd.read_csv('data/manifest_downloaded.csv')
for _, r in m.iterrows():
sid = r['sheet_id']
Path('data/interim/candidates').mkdir(parents=True, exist_ok=True)
print(f"Scanning {sid}")
PY
while IFS=, read -r sheet_id map_url tif_url map_path tif_path; do
if [[ "$sheet_id" == "sheet_id" ]]; then continue; fi
python -m bgtopo_poc.cli detect \
--config configs/blue_detector.yaml \
--sheet-id "$sheet_id" \
--map "$map_path" \
--tif "$tif_path" \
--out-dir data/interim/candidates
python -m bgtopo_poc.cli overlay \
--tif "$tif_path" \
--candidates "data/interim/candidates/${sheet_id}_candidates.csv" \
--out "reports/overlays/${sheet_id}_overlay.png"
done < data/manifest_downloaded.csv
python -m bgtopo_poc.cli report \
--candidates data/interim/candidates/*_candidates.csv \
--overlays reports/overlays/*_overlay.png \
--out reports/poc_report.html
echo "Open reports/poc_report.html"

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -euo pipefail
# Pick one downloaded sheet/candidate file first.
SHEET_ID="${1:-K-34-009-2}"
TIF="data/raw/${SHEET_ID}/${SHEET_ID}.tif"
CAND="data/interim/candidates/${SHEET_ID}_candidates.csv"
OUT="data/yolo/${SHEET_ID}"
python -m bgtopo_poc.cli export-yolo \
--config configs/blue_detector.yaml \
--sheet-id "$SHEET_ID" \
--tif "$TIF" \
--candidates "$CAND" \
--out-dir "$OUT" \
--tile-size 1024 \
--overlap 128
# RTX 3080 FE 10 GB: start batch 2-4 at imgsz=1024. Raise only if VRAM allows it.
python -m bgtopo_poc.cli train-yolo \
--data-yaml "$OUT/data.yaml" \
--model yolov8s.pt \
--imgsz 1024 \
--epochs 80 \
--batch 4 \
--device 0