rnd-v1
This commit is contained in:
1
bgtopo_poc/__init__.py
Normal file
1
bgtopo_poc/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "0.1.0"
|
||||
178
bgtopo_poc/cli.py
Normal file
178
bgtopo_poc/cli.py
Normal file
@@ -0,0 +1,178 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from .coordinates import extract_coordinate_crops, score_coordinates_for_sheet
|
||||
from .detector_cv import detect_sheet, draw_overlay
|
||||
from .export_yolo import export_candidates_to_yolo
|
||||
from .inventory import discover_original_assets, download_assets, read_manifest_csv, write_manifest_csv
|
||||
from .report import build_report
|
||||
from .train_yolo import train_yolo
|
||||
from .utils import load_yaml, setup_logging
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def cmd_inventory(args):
|
||||
cfg = load_yaml(args.config)
|
||||
base_url = args.base_url or cfg["source"]["base_url"]
|
||||
assets = discover_original_assets(base_url=base_url, include_100k=args.include_100k)
|
||||
if args.limit:
|
||||
assets = assets[: args.limit]
|
||||
write_manifest_csv(assets, args.out)
|
||||
|
||||
|
||||
def cmd_download(args):
|
||||
assets = read_manifest_csv(args.manifest)
|
||||
selected = download_assets(assets, args.out_dir, limit=args.limit, overwrite=args.overwrite)
|
||||
write_manifest_csv(selected, args.out_manifest)
|
||||
|
||||
|
||||
def cmd_detect(args):
|
||||
cfg = load_yaml(args.config)
|
||||
detect_sheet(args.map, args.tif, args.sheet_id, cfg, args.out_dir)
|
||||
|
||||
|
||||
def cmd_overlay(args):
|
||||
draw_overlay(args.tif, args.candidates, args.out)
|
||||
|
||||
|
||||
def cmd_score_coords(args):
|
||||
cfg = load_yaml(args.config)
|
||||
score_coordinates_for_sheet(
|
||||
coord_csv=args.coordinates,
|
||||
candidates_csv=args.candidates,
|
||||
map_path=args.map,
|
||||
tif_path=args.tif,
|
||||
sheet_id=args.sheet_id,
|
||||
cfg=cfg,
|
||||
out_dir=args.out_dir,
|
||||
coord_crs=args.coord_crs,
|
||||
)
|
||||
|
||||
|
||||
def cmd_crops(args):
|
||||
extract_coordinate_crops(
|
||||
coord_scores_csv=args.scores,
|
||||
map_path=args.map,
|
||||
tif_path=args.tif,
|
||||
out_dir=args.out_dir,
|
||||
crop_size=args.crop_size,
|
||||
)
|
||||
|
||||
|
||||
def cmd_export_yolo(args):
|
||||
cfg = load_yaml(args.config)
|
||||
export_candidates_to_yolo(
|
||||
tif_path=args.tif,
|
||||
candidates_csv=args.candidates,
|
||||
out_dir=args.out_dir,
|
||||
cfg=cfg,
|
||||
sheet_id=args.sheet_id,
|
||||
tile_size=args.tile_size,
|
||||
overlap=args.overlap,
|
||||
val_fraction=args.val_fraction,
|
||||
)
|
||||
|
||||
|
||||
def cmd_train_yolo(args):
|
||||
train_yolo(args.data_yaml, model=args.model, imgsz=args.imgsz, epochs=args.epochs, batch=args.batch, device=args.device)
|
||||
|
||||
|
||||
def cmd_report(args):
|
||||
build_report([Path(p) for p in args.candidates], [Path(p) for p in args.overlays], args.out)
|
||||
|
||||
|
||||
def build_parser():
|
||||
p = argparse.ArgumentParser(prog="bgtopo-bluebox", description="BGtopoVJ blue rectangle/square PoC pipeline")
|
||||
p.add_argument("--verbose", action="store_true")
|
||||
sub = p.add_subparsers(dest="cmd", required=True)
|
||||
|
||||
s = sub.add_parser("inventory", help="Crawl original raster directory and create manifest")
|
||||
s.add_argument("--config", default="configs/blue_detector.yaml")
|
||||
s.add_argument("--base-url", default=None)
|
||||
s.add_argument("--out", default="data/manifest.csv")
|
||||
s.add_argument("--limit", type=int, default=None)
|
||||
s.add_argument("--include-100k", action="store_true")
|
||||
s.set_defaults(func=cmd_inventory)
|
||||
|
||||
s = sub.add_parser("download", help="Download .map/.tif pairs from manifest")
|
||||
s.add_argument("--manifest", default="data/manifest.csv")
|
||||
s.add_argument("--out-dir", default="data/raw")
|
||||
s.add_argument("--out-manifest", default="data/manifest_downloaded.csv")
|
||||
s.add_argument("--limit", type=int, default=2)
|
||||
s.add_argument("--overwrite", action="store_true")
|
||||
s.set_defaults(func=cmd_download)
|
||||
|
||||
s = sub.add_parser("detect", help="Detect blue rectangle/square candidates on one sheet")
|
||||
s.add_argument("--config", default="configs/blue_detector.yaml")
|
||||
s.add_argument("--sheet-id", required=True)
|
||||
s.add_argument("--map", default=None)
|
||||
s.add_argument("--tif", required=True)
|
||||
s.add_argument("--out-dir", default="data/interim/candidates")
|
||||
s.set_defaults(func=cmd_detect)
|
||||
|
||||
s = sub.add_parser("overlay", help="Draw candidate overlay for manual QA")
|
||||
s.add_argument("--tif", required=True)
|
||||
s.add_argument("--candidates", required=True)
|
||||
s.add_argument("--out", required=True)
|
||||
s.set_defaults(func=cmd_overlay)
|
||||
|
||||
s = sub.add_parser("score-coords", help="Score coordinate CSV against candidates from one sheet")
|
||||
s.add_argument("--config", default="configs/blue_detector.yaml")
|
||||
s.add_argument("--sheet-id", required=True)
|
||||
s.add_argument("--coordinates", required=True)
|
||||
s.add_argument("--candidates", required=True)
|
||||
s.add_argument("--map", default=None)
|
||||
s.add_argument("--tif", required=True)
|
||||
s.add_argument("--out-dir", default="data/interim/coordinate_scores")
|
||||
s.add_argument("--coord-crs", default="EPSG:4326")
|
||||
s.set_defaults(func=cmd_score_coords)
|
||||
|
||||
s = sub.add_parser("crops", help="Extract review crops around scored coordinates")
|
||||
s.add_argument("--scores", required=True)
|
||||
s.add_argument("--map", default=None)
|
||||
s.add_argument("--tif", required=True)
|
||||
s.add_argument("--out-dir", default="data/interim/crops")
|
||||
s.add_argument("--crop-size", type=int, default=256)
|
||||
s.set_defaults(func=cmd_crops)
|
||||
|
||||
s = sub.add_parser("export-yolo", help="Export weak candidates to YOLO dataset format")
|
||||
s.add_argument("--config", default="configs/blue_detector.yaml")
|
||||
s.add_argument("--sheet-id", required=True)
|
||||
s.add_argument("--tif", required=True)
|
||||
s.add_argument("--candidates", required=True)
|
||||
s.add_argument("--out-dir", default="data/yolo/bluebox")
|
||||
s.add_argument("--tile-size", type=int, default=1024)
|
||||
s.add_argument("--overlap", type=int, default=128)
|
||||
s.add_argument("--val-fraction", type=float, default=0.20)
|
||||
s.set_defaults(func=cmd_export_yolo)
|
||||
|
||||
s = sub.add_parser("train-yolo", help="Train YOLO on exported dataset")
|
||||
s.add_argument("--data-yaml", required=True)
|
||||
s.add_argument("--model", default="yolov8s.pt")
|
||||
s.add_argument("--imgsz", type=int, default=1024)
|
||||
s.add_argument("--epochs", type=int, default=80)
|
||||
s.add_argument("--batch", type=int, default=4)
|
||||
s.add_argument("--device", default="0")
|
||||
s.set_defaults(func=cmd_train_yolo)
|
||||
|
||||
s = sub.add_parser("report", help="Build HTML QA report")
|
||||
s.add_argument("--candidates", nargs="+", required=True)
|
||||
s.add_argument("--overlays", nargs="*", default=[])
|
||||
s.add_argument("--out", default="reports/poc_report.html")
|
||||
s.set_defaults(func=cmd_report)
|
||||
return p
|
||||
|
||||
|
||||
def main():
|
||||
parser = build_parser()
|
||||
args = parser.parse_args()
|
||||
setup_logging(args.verbose)
|
||||
args.func(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
150
bgtopo_poc/coordinates.py
Normal file
150
bgtopo_poc/coordinates.py
Normal file
@@ -0,0 +1,150 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from PIL import Image, ImageDraw
|
||||
from pyproj import Transformer
|
||||
from rasterio.windows import Window
|
||||
|
||||
from .georef import open_georaster, read_window_rgb
|
||||
from .utils import ensure_dir
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _normalize_coordinate_columns(df: pd.DataFrame) -> pd.DataFrame:
|
||||
cols = {c.lower().strip(): c for c in df.columns}
|
||||
lat_col = cols.get("lat") or cols.get("latitude") or cols.get("y")
|
||||
lon_col = cols.get("lon") or cols.get("lng") or cols.get("longitude") or cols.get("x")
|
||||
if not lat_col or not lon_col:
|
||||
raise ValueError("Coordinate CSV needs lat/lon columns, or latitude/longitude, or y/x.")
|
||||
out = df.copy()
|
||||
out["lat"] = pd.to_numeric(out[lat_col], errors="coerce")
|
||||
out["lon"] = pd.to_numeric(out[lon_col], errors="coerce")
|
||||
if "id" not in out.columns:
|
||||
out["id"] = [f"pt_{i:06d}" for i in range(len(out))]
|
||||
return out.dropna(subset=["lat", "lon"])
|
||||
|
||||
|
||||
def load_coordinates(path: str | Path) -> pd.DataFrame:
|
||||
return _normalize_coordinate_columns(pd.read_csv(path))
|
||||
|
||||
|
||||
def coord_to_rowcol(ds, lon: float, lat: float, coord_crs: str = "EPSG:4326") -> Optional[tuple[int, int]]:
|
||||
if ds.crs is None:
|
||||
return None
|
||||
try:
|
||||
transformer = Transformer.from_crs(coord_crs, ds.crs, always_xy=True)
|
||||
x, y = transformer.transform(lon, lat)
|
||||
row, col = ds.index(x, y)
|
||||
return int(row), int(col)
|
||||
except Exception as e: # noqa: BLE001
|
||||
LOG.debug("coord_to_rowcol failed: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
def score_coordinates_for_sheet(
|
||||
coord_csv: str | Path,
|
||||
candidates_csv: str | Path,
|
||||
map_path: str | None,
|
||||
tif_path: str | None,
|
||||
sheet_id: str,
|
||||
cfg: Dict,
|
||||
out_dir: str | Path,
|
||||
coord_crs: str = "EPSG:4326",
|
||||
) -> Path:
|
||||
out_dir = ensure_dir(out_dir)
|
||||
coords = load_coordinates(coord_csv)
|
||||
cands = pd.read_csv(candidates_csv)
|
||||
rh = open_georaster(map_path=map_path, tif_path=tif_path)
|
||||
radius = float(cfg["coordinate_scoring"].get("search_radius_px", 45))
|
||||
try:
|
||||
rows: List[dict] = []
|
||||
for _, pt in coords.iterrows():
|
||||
rc = coord_to_rowcol(rh.dataset, float(pt.lon), float(pt.lat), coord_crs=coord_crs)
|
||||
if rc is None:
|
||||
continue
|
||||
row, col = rc
|
||||
if row < 0 or col < 0 or row >= rh.height or col >= rh.width:
|
||||
continue
|
||||
if cands.empty:
|
||||
nearest = None
|
||||
else:
|
||||
dx = cands["cx"].astype(float).to_numpy() - col
|
||||
dy = cands["cy"].astype(float).to_numpy() - row
|
||||
dist = np.sqrt(dx * dx + dy * dy)
|
||||
i = int(np.argmin(dist))
|
||||
nearest = (i, float(dist[i]))
|
||||
score = 0.0
|
||||
nearest_id = None
|
||||
nearest_dist = None
|
||||
nearest_style = None
|
||||
nearest_det_score = None
|
||||
decision = "auto_negative"
|
||||
if nearest:
|
||||
i, d = nearest
|
||||
nearest_dist = d
|
||||
if d <= radius:
|
||||
nearest_id = i
|
||||
nearest_style = str(cands.iloc[i].get("fill_style", "unknown"))
|
||||
nearest_det_score = float(cands.iloc[i].get("score", 0.0))
|
||||
dist_factor = max(0.0, 1.0 - d / radius)
|
||||
score = float(0.55 * nearest_det_score + 0.45 * dist_factor)
|
||||
if score >= float(cfg["coordinate_scoring"].get("strong_score", 0.90)):
|
||||
decision = "auto_positive"
|
||||
elif score >= float(cfg["coordinate_scoring"].get("weak_score", 0.40)):
|
||||
decision = "review"
|
||||
rows.append({
|
||||
"id": pt.id,
|
||||
"sheet_id": sheet_id,
|
||||
"lat": float(pt.lat),
|
||||
"lon": float(pt.lon),
|
||||
"row": row,
|
||||
"col": col,
|
||||
"nearest_candidate_index": nearest_id,
|
||||
"nearest_candidate_distance_px": nearest_dist,
|
||||
"nearest_candidate_style": nearest_style,
|
||||
"nearest_candidate_score": nearest_det_score,
|
||||
"coordinate_score": score,
|
||||
"decision": decision,
|
||||
})
|
||||
out_csv = Path(out_dir) / f"{sheet_id}_coordinate_scores.csv"
|
||||
pd.DataFrame(rows).to_csv(out_csv, index=False)
|
||||
LOG.info("Wrote coordinate scores: %s", out_csv)
|
||||
return out_csv
|
||||
finally:
|
||||
rh.close()
|
||||
|
||||
|
||||
def extract_coordinate_crops(
|
||||
coord_scores_csv: str | Path,
|
||||
map_path: str | None,
|
||||
tif_path: str | None,
|
||||
out_dir: str | Path,
|
||||
crop_size: int = 256,
|
||||
only_decisions: tuple[str, ...] = ("review", "auto_positive"),
|
||||
) -> Path:
|
||||
out_dir = ensure_dir(out_dir)
|
||||
df = pd.read_csv(coord_scores_csv)
|
||||
rh = open_georaster(map_path=map_path, tif_path=tif_path)
|
||||
half = crop_size // 2
|
||||
try:
|
||||
for _, r in df.iterrows():
|
||||
if str(r.decision) not in only_decisions:
|
||||
continue
|
||||
row, col = int(r.row), int(r.col)
|
||||
win = Window(col - half, row - half, crop_size, crop_size)
|
||||
rgb = read_window_rgb(rh.dataset, win)
|
||||
img = Image.fromarray(rgb).convert("RGB")
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.ellipse([half - 5, half - 5, half + 5, half + 5], outline=(255, 0, 0), width=2)
|
||||
name = f"{str(r.id)}__{str(r.decision)}__score_{float(r.coordinate_score):.3f}.png"
|
||||
img.save(Path(out_dir) / name)
|
||||
LOG.info("Wrote crops into: %s", out_dir)
|
||||
return Path(out_dir)
|
||||
finally:
|
||||
rh.close()
|
||||
223
bgtopo_poc/detector_cv.py
Normal file
223
bgtopo_poc/detector_cv.py
Normal file
@@ -0,0 +1,223 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Tuple
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from PIL import Image, ImageDraw
|
||||
from rasterio.windows import Window
|
||||
from tqdm import tqdm
|
||||
|
||||
from .georef import iter_windows, open_georaster, pixel_to_lonlat, read_window_rgb
|
||||
from .utils import ensure_dir, safe_stem
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Candidate:
|
||||
sheet_id: str
|
||||
source_path: str
|
||||
x: int
|
||||
y: int
|
||||
w: int
|
||||
h: int
|
||||
cx: float
|
||||
cy: float
|
||||
area: float
|
||||
aspect: float
|
||||
blue_fill_ratio: float
|
||||
rectangularity: float
|
||||
solidity: float
|
||||
approx_vertices: int
|
||||
fill_style: str
|
||||
score: float
|
||||
lon: float | None = None
|
||||
lat: float | None = None
|
||||
|
||||
def to_dict(self):
|
||||
return asdict(self)
|
||||
|
||||
|
||||
def build_blue_mask(rgb: np.ndarray, cfg: Dict) -> np.ndarray:
|
||||
hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV)
|
||||
mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
|
||||
for r in cfg["detector"].get("hsv_ranges", []):
|
||||
lower = np.array(r["lower"], dtype=np.uint8)
|
||||
upper = np.array(r["upper"], dtype=np.uint8)
|
||||
mask |= cv2.inRange(hsv, lower, upper)
|
||||
morph = cfg["detector"].get("morphology", {})
|
||||
open_k = int(morph.get("open_kernel", 0) or 0)
|
||||
close_k = int(morph.get("close_kernel", 0) or 0)
|
||||
if open_k > 1:
|
||||
k = np.ones((open_k, open_k), np.uint8)
|
||||
mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, k)
|
||||
if close_k > 1:
|
||||
k = np.ones((close_k, close_k), np.uint8)
|
||||
mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, k)
|
||||
dilate_iter = int(morph.get("dilate_iterations", 0) or 0)
|
||||
if dilate_iter > 0:
|
||||
mask = cv2.dilate(mask, np.ones((2, 2), np.uint8), iterations=dilate_iter)
|
||||
return mask
|
||||
|
||||
|
||||
def _classify_fill_style(blue_fill_ratio: float, rectangularity: float, solidity: float) -> str:
|
||||
if blue_fill_ratio >= 0.48 and solidity >= 0.60:
|
||||
return "filled"
|
||||
if 0.10 <= blue_fill_ratio < 0.42 and rectangularity >= 0.28:
|
||||
return "hollow"
|
||||
if blue_fill_ratio < 0.18 and rectangularity >= 0.18:
|
||||
return "border"
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _score_candidate(blue_fill_ratio: float, rectangularity: float, solidity: float, aspect: float, approx_vertices: int) -> float:
|
||||
aspect_bonus = 1.0 - min(abs(np.log(max(aspect, 1e-3))), 1.8) / 1.8
|
||||
vertex_bonus = 1.0 if 4 <= approx_vertices <= 8 else 0.55
|
||||
raw = 0.30 * blue_fill_ratio + 0.28 * rectangularity + 0.22 * solidity + 0.12 * aspect_bonus + 0.08 * vertex_bonus
|
||||
return float(max(0.0, min(1.0, raw)))
|
||||
|
||||
|
||||
def find_candidates_in_rgb(rgb: np.ndarray, cfg: Dict, sheet_id: str, source_path: str, xoff: int = 0, yoff: int = 0) -> List[Candidate]:
|
||||
det = cfg["detector"]
|
||||
mask = build_blue_mask(rgb, cfg)
|
||||
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
out: List[Candidate] = []
|
||||
|
||||
for contour in contours:
|
||||
area = float(cv2.contourArea(contour))
|
||||
if area < det["min_area_px"] or area > det["max_area_px"]:
|
||||
continue
|
||||
x, y, w, h = cv2.boundingRect(contour)
|
||||
if w < det["min_width_px"] or h < det["min_height_px"]:
|
||||
continue
|
||||
if w > det["max_width_px"] or h > det["max_height_px"]:
|
||||
continue
|
||||
aspect = float(w / max(h, 1))
|
||||
if not (det["min_aspect"] <= aspect <= det["max_aspect"]):
|
||||
continue
|
||||
|
||||
bbox_mask = mask[y:y + h, x:x + w]
|
||||
blue_fill_ratio = float(np.count_nonzero(bbox_mask) / max(w * h, 1))
|
||||
if not (det["min_blue_fill_ratio"] <= blue_fill_ratio <= det["max_blue_fill_ratio"]):
|
||||
continue
|
||||
|
||||
rectangularity = float(area / max(w * h, 1))
|
||||
if rectangularity < det["min_rectangularity"]:
|
||||
continue
|
||||
|
||||
hull = cv2.convexHull(contour)
|
||||
hull_area = float(cv2.contourArea(hull))
|
||||
solidity = float(area / hull_area) if hull_area > 0 else 0.0
|
||||
if solidity < det["min_solidity"]:
|
||||
continue
|
||||
|
||||
peri = float(cv2.arcLength(contour, True))
|
||||
approx = cv2.approxPolyDP(contour, 0.04 * peri, True)
|
||||
fill_style = _classify_fill_style(blue_fill_ratio, rectangularity, solidity)
|
||||
score = _score_candidate(blue_fill_ratio, rectangularity, solidity, aspect, len(approx))
|
||||
out.append(
|
||||
Candidate(
|
||||
sheet_id=sheet_id,
|
||||
source_path=source_path,
|
||||
x=int(x + xoff),
|
||||
y=int(y + yoff),
|
||||
w=int(w),
|
||||
h=int(h),
|
||||
cx=float(x + xoff + w / 2),
|
||||
cy=float(y + yoff + h / 2),
|
||||
area=area,
|
||||
aspect=aspect,
|
||||
blue_fill_ratio=blue_fill_ratio,
|
||||
rectangularity=rectangularity,
|
||||
solidity=solidity,
|
||||
approx_vertices=int(len(approx)),
|
||||
fill_style=fill_style,
|
||||
score=score,
|
||||
)
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
def bbox_iou(a: Candidate, b: Candidate) -> float:
|
||||
ax1, ay1, ax2, ay2 = a.x, a.y, a.x + a.w, a.y + a.h
|
||||
bx1, by1, bx2, by2 = b.x, b.y, b.x + b.w, b.y + b.h
|
||||
ix1, iy1 = max(ax1, bx1), max(ay1, by1)
|
||||
ix2, iy2 = min(ax2, bx2), min(ay2, by2)
|
||||
iw, ih = max(0, ix2 - ix1), max(0, iy2 - iy1)
|
||||
inter = iw * ih
|
||||
union = a.w * a.h + b.w * b.h - inter
|
||||
return float(inter / union) if union else 0.0
|
||||
|
||||
|
||||
def nms(cands: Iterable[Candidate], threshold: float) -> List[Candidate]:
|
||||
items = sorted(cands, key=lambda c: c.score, reverse=True)
|
||||
kept: List[Candidate] = []
|
||||
for c in items:
|
||||
if all(bbox_iou(c, k) < threshold for k in kept):
|
||||
kept.append(c)
|
||||
return kept
|
||||
|
||||
|
||||
def detect_sheet(map_path: str | None, tif_path: str | None, sheet_id: str, cfg: Dict, out_dir: str | Path) -> Path:
|
||||
out_dir = ensure_dir(out_dir)
|
||||
rh = open_georaster(map_path=map_path, tif_path=tif_path)
|
||||
try:
|
||||
tile_size = int(cfg["detector"]["tile_size"])
|
||||
overlap = int(cfg["detector"]["tile_overlap"])
|
||||
all_candidates: List[Candidate] = []
|
||||
windows = list(iter_windows(rh.width, rh.height, tile_size, overlap))
|
||||
LOG.info("Scanning %s as %d windows (%dx%d px)", sheet_id, len(windows), rh.width, rh.height)
|
||||
for win in tqdm(windows, desc=f"scan {sheet_id}"):
|
||||
rgb = read_window_rgb(rh.dataset, win)
|
||||
cands = find_candidates_in_rgb(
|
||||
rgb=rgb,
|
||||
cfg=cfg,
|
||||
sheet_id=sheet_id,
|
||||
source_path=str(rh.path),
|
||||
xoff=int(win.col_off),
|
||||
yoff=int(win.row_off),
|
||||
)
|
||||
all_candidates.extend(cands)
|
||||
kept = nms(all_candidates, float(cfg["detector"].get("nms_iou_threshold", 0.25)))
|
||||
# Attach georeferenced centers when possible.
|
||||
for c in kept:
|
||||
if rh.crs is not None:
|
||||
xgeo, ygeo = pixel_to_lonlat(rh.dataset, int(c.cy), int(c.cx))
|
||||
c.lon = xgeo
|
||||
c.lat = ygeo
|
||||
out_csv = out_dir / f"{sheet_id}_candidates.csv"
|
||||
pd.DataFrame([c.to_dict() for c in kept]).to_csv(out_csv, index=False)
|
||||
LOG.info("%s: wrote %d candidates to %s", sheet_id, len(kept), out_csv)
|
||||
return out_csv
|
||||
finally:
|
||||
rh.close()
|
||||
|
||||
|
||||
def draw_overlay(tif_path: str | Path, candidates_csv: str | Path, out_png: str | Path, max_side: int = 2400) -> Path:
|
||||
"""Draw candidates on a downscaled image. Uses TIFF path for simple visual QA."""
|
||||
img = Image.open(tif_path).convert("RGB")
|
||||
scale = min(1.0, max_side / max(img.size))
|
||||
disp = img.resize((int(img.width * scale), int(img.height * scale))) if scale < 1 else img.copy()
|
||||
draw = ImageDraw.Draw(disp)
|
||||
df = pd.read_csv(candidates_csv)
|
||||
color_by_style = {
|
||||
"filled": (0, 255, 255),
|
||||
"hollow": (0, 120, 255),
|
||||
"border": (20, 20, 255),
|
||||
"unknown": (255, 255, 0),
|
||||
}
|
||||
for _, r in df.iterrows():
|
||||
color = color_by_style.get(str(r.get("fill_style", "unknown")), (255, 255, 0))
|
||||
x1, y1 = float(r.x) * scale, float(r.y) * scale
|
||||
x2, y2 = float(r.x + r.w) * scale, float(r.y + r.h) * scale
|
||||
draw.rectangle([x1, y1, x2, y2], outline=color, width=max(1, int(2 * scale + 1)))
|
||||
out_png = Path(out_png)
|
||||
ensure_dir(out_png.parent)
|
||||
disp.save(out_png)
|
||||
LOG.info("Wrote overlay: %s", out_png)
|
||||
return out_png
|
||||
117
bgtopo_poc/export_yolo.py
Normal file
117
bgtopo_poc/export_yolo.py
Normal file
@@ -0,0 +1,117 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import random
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
from PIL import Image
|
||||
|
||||
from .utils import ensure_dir
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
STYLE_TO_CLASS = {
|
||||
"unknown": 0,
|
||||
"filled": 1,
|
||||
"hollow": 2,
|
||||
"border": 3,
|
||||
}
|
||||
|
||||
|
||||
def _crop_image_and_labels(img: Image.Image, boxes: pd.DataFrame, x0: int, y0: int, size: int):
|
||||
crop = img.crop((x0, y0, x0 + size, y0 + size)).convert("RGB")
|
||||
labels = []
|
||||
for _, r in boxes.iterrows():
|
||||
bx1, by1, bx2, by2 = float(r.x), float(r.y), float(r.x + r.w), float(r.y + r.h)
|
||||
ix1, iy1 = max(bx1, x0), max(by1, y0)
|
||||
ix2, iy2 = min(bx2, x0 + size), min(by2, y0 + size)
|
||||
if ix2 <= ix1 or iy2 <= iy1:
|
||||
continue
|
||||
visible_area = (ix2 - ix1) * (iy2 - iy1)
|
||||
box_area = max((bx2 - bx1) * (by2 - by1), 1)
|
||||
if visible_area / box_area < 0.35:
|
||||
continue
|
||||
cx = ((ix1 + ix2) / 2 - x0) / size
|
||||
cy = ((iy1 + iy2) / 2 - y0) / size
|
||||
w = (ix2 - ix1) / size
|
||||
h = (iy2 - iy1) / size
|
||||
cls = STYLE_TO_CLASS.get(str(r.get("fill_style", "unknown")), 0)
|
||||
labels.append(f"{cls} {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}")
|
||||
return crop, labels
|
||||
|
||||
|
||||
def export_candidates_to_yolo(
|
||||
tif_path: str | Path,
|
||||
candidates_csv: str | Path,
|
||||
out_dir: str | Path,
|
||||
cfg: Dict,
|
||||
sheet_id: str,
|
||||
tile_size: int = 1024,
|
||||
overlap: int = 128,
|
||||
val_fraction: float = 0.20,
|
||||
include_empty_tiles: bool = True,
|
||||
max_empty_tiles: int = 250,
|
||||
) -> Path:
|
||||
out_dir = Path(out_dir)
|
||||
for split in ["train", "val"]:
|
||||
ensure_dir(out_dir / "images" / split)
|
||||
ensure_dir(out_dir / "labels" / split)
|
||||
|
||||
img = Image.open(tif_path).convert("RGB")
|
||||
boxes = pd.read_csv(candidates_csv)
|
||||
step = max(1, tile_size - overlap)
|
||||
random.seed(42)
|
||||
empty_written = 0
|
||||
total_written = 0
|
||||
|
||||
for y0 in range(0, max(1, img.height - tile_size + 1), step):
|
||||
for x0 in range(0, max(1, img.width - tile_size + 1), step):
|
||||
in_tile = boxes[
|
||||
(boxes.cx >= x0) & (boxes.cx < x0 + tile_size) &
|
||||
(boxes.cy >= y0) & (boxes.cy < y0 + tile_size)
|
||||
]
|
||||
if in_tile.empty:
|
||||
if not include_empty_tiles or empty_written >= max_empty_tiles:
|
||||
continue
|
||||
# Keep some empty/hard-negative tiles to stop the model from detecting all blue map details.
|
||||
if random.random() > 0.08:
|
||||
continue
|
||||
empty_written += 1
|
||||
crop, labels = _crop_image_and_labels(img, boxes, x0, y0, tile_size)
|
||||
split = "val" if random.random() < val_fraction else "train"
|
||||
stem = f"{sheet_id}_{x0}_{y0}"
|
||||
crop.save(out_dir / "images" / split / f"{stem}.jpg", quality=92)
|
||||
with open(out_dir / "labels" / split / f"{stem}.txt", "w", encoding="utf-8") as f:
|
||||
f.write("\n".join(labels))
|
||||
total_written += 1
|
||||
|
||||
data_yaml = out_dir / "data.yaml"
|
||||
names = cfg.get("export", {}).get("yolo_class_names", ["blue_rect_unknown", "blue_rect_filled", "blue_rect_hollow", "blue_rect_border"])
|
||||
with open(data_yaml, "w", encoding="utf-8") as f:
|
||||
f.write(f"path: {out_dir.resolve()}\n")
|
||||
f.write("train: images/train\n")
|
||||
f.write("val: images/val\n")
|
||||
f.write("names:\n")
|
||||
for i, name in enumerate(names):
|
||||
f.write(f" {i}: {name}\n")
|
||||
LOG.info("YOLO export complete: %s (%d tiles)", data_yaml, total_written)
|
||||
return data_yaml
|
||||
|
||||
|
||||
def merge_yolo_datasets(src_dirs: list[str | Path], out_dir: str | Path) -> Path:
|
||||
out_dir = Path(out_dir)
|
||||
for split in ["train", "val"]:
|
||||
ensure_dir(out_dir / "images" / split)
|
||||
ensure_dir(out_dir / "labels" / split)
|
||||
for src in src_dirs:
|
||||
src = Path(src)
|
||||
for split in ["train", "val"]:
|
||||
for img in (src / "images" / split).glob("*.jpg"):
|
||||
shutil.copy2(img, out_dir / "images" / split / img.name)
|
||||
for lab in (src / "labels" / split).glob("*.txt"):
|
||||
shutil.copy2(lab, out_dir / "labels" / split / lab.name)
|
||||
return out_dir
|
||||
116
bgtopo_poc/georef.py
Normal file
116
bgtopo_poc/georef.py
Normal file
@@ -0,0 +1,116 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import rasterio
|
||||
from rasterio.errors import RasterioIOError
|
||||
from rasterio.transform import rowcol, xy
|
||||
from rasterio.windows import Window
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RasterHandle:
|
||||
path: Path
|
||||
dataset: rasterio.io.DatasetReader
|
||||
used_map_file: bool
|
||||
|
||||
@property
|
||||
def width(self) -> int:
|
||||
return self.dataset.width
|
||||
|
||||
@property
|
||||
def height(self) -> int:
|
||||
return self.dataset.height
|
||||
|
||||
@property
|
||||
def crs(self):
|
||||
return self.dataset.crs
|
||||
|
||||
@property
|
||||
def transform(self):
|
||||
return self.dataset.transform
|
||||
|
||||
def close(self) -> None:
|
||||
self.dataset.close()
|
||||
|
||||
|
||||
def open_georaster(map_path: Optional[str | Path] = None, tif_path: Optional[str | Path] = None) -> RasterHandle:
|
||||
"""Open .map if possible, otherwise .tif. .map gives georeferencing through GDAL's MAP driver."""
|
||||
last_err = None
|
||||
if map_path:
|
||||
try:
|
||||
p = Path(map_path)
|
||||
ds = rasterio.open(p)
|
||||
LOG.info("Opened georeferenced MAP dataset: %s", p)
|
||||
return RasterHandle(p, ds, used_map_file=True)
|
||||
except Exception as e: # noqa: BLE001
|
||||
last_err = e
|
||||
LOG.warning("Could not open MAP dataset %s: %s", map_path, e)
|
||||
if tif_path:
|
||||
try:
|
||||
p = Path(tif_path)
|
||||
ds = rasterio.open(p)
|
||||
LOG.info("Opened TIFF dataset: %s", p)
|
||||
return RasterHandle(p, ds, used_map_file=False)
|
||||
except RasterioIOError as e:
|
||||
last_err = e
|
||||
raise RuntimeError(f"Could not open raster. Last error: {last_err}")
|
||||
|
||||
|
||||
def read_window_rgb(ds: rasterio.io.DatasetReader, window: Window) -> np.ndarray:
|
||||
"""Read a raster window as uint8 RGB HxWx3."""
|
||||
arr = ds.read(window=window, boundless=True, fill_value=255)
|
||||
if arr.ndim != 3:
|
||||
raise ValueError(f"Expected band-first array, got shape={arr.shape}")
|
||||
if arr.shape[0] >= 3:
|
||||
arr = arr[:3]
|
||||
elif arr.shape[0] == 1:
|
||||
arr = np.repeat(arr, 3, axis=0)
|
||||
arr = np.moveaxis(arr, 0, -1)
|
||||
if arr.dtype != np.uint8:
|
||||
arr = np.clip(arr, 0, 255).astype(np.uint8)
|
||||
return arr
|
||||
|
||||
|
||||
def iter_windows(width: int, height: int, tile_size: int, overlap: int):
|
||||
step = max(1, tile_size - overlap)
|
||||
y = 0
|
||||
while y < height:
|
||||
x = 0
|
||||
h = min(tile_size, height - y)
|
||||
while x < width:
|
||||
w = min(tile_size, width - x)
|
||||
yield Window(x, y, w, h)
|
||||
if x + tile_size >= width:
|
||||
break
|
||||
x += step
|
||||
if y + tile_size >= height:
|
||||
break
|
||||
y += step
|
||||
|
||||
|
||||
def lonlat_to_pixel(ds: rasterio.io.DatasetReader, lon: float, lat: float) -> Tuple[int, int]:
|
||||
"""Convert lon/lat to row/col. Assumes the raster CRS accepts lon/lat or GDAL handles geographic transform.
|
||||
|
||||
For a production version, reproject coordinates into ds.crs with pyproj first. The PoC does that in coordinates.py.
|
||||
"""
|
||||
row, col = rowcol(ds.transform, lon, lat)
|
||||
return int(row), int(col)
|
||||
|
||||
|
||||
def pixel_to_lonlat(ds: rasterio.io.DatasetReader, row: int, col: int) -> Tuple[float, float]:
|
||||
x, y = xy(ds.transform, row, col, offset="center")
|
||||
return float(x), float(y)
|
||||
|
||||
|
||||
def has_real_georef(ds: rasterio.io.DatasetReader) -> bool:
|
||||
try:
|
||||
return ds.crs is not None and not ds.transform.is_identity
|
||||
except Exception:
|
||||
return False
|
||||
115
bgtopo_poc/inventory.py
Normal file
115
bgtopo_poc/inventory.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm
|
||||
|
||||
from .utils import ensure_dir
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SheetAsset:
|
||||
sheet_id: str
|
||||
map_url: Optional[str]
|
||||
tif_url: Optional[str]
|
||||
map_path: Optional[str] = None
|
||||
tif_path: Optional[str] = None
|
||||
|
||||
def to_dict(self):
|
||||
return asdict(self)
|
||||
|
||||
|
||||
def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
|
||||
"""Discover .map/.tif pairs from the BGtopoVJ original raster directory listing."""
|
||||
LOG.info("Discovering assets from %s", base_url)
|
||||
html = requests.get(base_url, timeout=60).text
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]
|
||||
|
||||
by_sheet: dict[str, SheetAsset] = {}
|
||||
for href in hrefs:
|
||||
if href.startswith("?") or href.startswith("/") or href == "../":
|
||||
continue
|
||||
if not include_100k and "100k" in href.lower():
|
||||
continue
|
||||
if not (href.lower().endswith(".map") or href.lower().endswith(".tif") or href.lower().endswith(".tiff")):
|
||||
continue
|
||||
sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
|
||||
item = by_sheet.setdefault(sheet_id, SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None))
|
||||
full_url = urljoin(base_url, href)
|
||||
if href.lower().endswith(".map"):
|
||||
item.map_url = full_url
|
||||
else:
|
||||
item.tif_url = full_url
|
||||
|
||||
assets = [v for v in by_sheet.values() if v.map_url and v.tif_url]
|
||||
assets.sort(key=lambda x: x.sheet_id)
|
||||
LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
|
||||
return assets
|
||||
|
||||
|
||||
def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
|
||||
rows = [a.to_dict() for a in assets]
|
||||
out_csv = Path(out_csv)
|
||||
ensure_dir(out_csv.parent)
|
||||
pd.DataFrame(rows).to_csv(out_csv, index=False)
|
||||
LOG.info("Wrote manifest: %s", out_csv)
|
||||
return out_csv
|
||||
|
||||
|
||||
def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
|
||||
df = pd.read_csv(path).fillna("")
|
||||
assets: List[SheetAsset] = []
|
||||
for _, r in df.iterrows():
|
||||
assets.append(
|
||||
SheetAsset(
|
||||
sheet_id=str(r["sheet_id"]),
|
||||
map_url=str(r.get("map_url") or "") or None,
|
||||
tif_url=str(r.get("tif_url") or "") or None,
|
||||
map_path=str(r.get("map_path") or "") or None,
|
||||
tif_path=str(r.get("tif_path") or "") or None,
|
||||
)
|
||||
)
|
||||
return assets
|
||||
|
||||
|
||||
def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
|
||||
if out_path.exists() and out_path.stat().st_size > 0 and not overwrite:
|
||||
return out_path
|
||||
ensure_dir(out_path.parent)
|
||||
with requests.get(url, stream=True, timeout=120) as r:
|
||||
r.raise_for_status()
|
||||
total = int(r.headers.get("content-length", "0") or 0)
|
||||
with open(out_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=out_path.name) as pbar:
|
||||
for chunk in r.iter_content(chunk_size=1024 * 512):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
pbar.update(len(chunk))
|
||||
return out_path
|
||||
|
||||
|
||||
def download_assets(
|
||||
assets: List[SheetAsset],
|
||||
out_dir: str | Path,
|
||||
limit: Optional[int] = None,
|
||||
overwrite: bool = False,
|
||||
) -> List[SheetAsset]:
|
||||
out_dir = Path(out_dir)
|
||||
selected = assets[:limit] if limit else assets
|
||||
for item in selected:
|
||||
sheet_dir = out_dir / item.sheet_id
|
||||
if item.map_url:
|
||||
item.map_path = str(_download_one(item.map_url, sheet_dir / f"{item.sheet_id}.map", overwrite=overwrite))
|
||||
if item.tif_url:
|
||||
item.tif_path = str(_download_one(item.tif_url, sheet_dir / f"{item.sheet_id}.tif", overwrite=overwrite))
|
||||
return selected
|
||||
80
bgtopo_poc/report.py
Normal file
80
bgtopo_poc/report.py
Normal file
@@ -0,0 +1,80 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from jinja2 import Template
|
||||
|
||||
from .utils import ensure_dir
|
||||
|
||||
REPORT_TEMPLATE = """
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<title>BGtopoVJ Blue Box PoC Report</title>
|
||||
<style>
|
||||
body { font-family: Arial, sans-serif; max-width: 1200px; margin: 24px auto; line-height: 1.45; }
|
||||
table { border-collapse: collapse; width: 100%; margin: 16px 0; }
|
||||
th, td { border: 1px solid #ddd; padding: 8px; font-size: 14px; }
|
||||
th { background: #f3f3f3; text-align: left; }
|
||||
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 18px; }
|
||||
.card { border: 1px solid #ddd; border-radius: 12px; padding: 12px; }
|
||||
img { max-width: 100%; border: 1px solid #ccc; }
|
||||
code { background: #f6f6f6; padding: 2px 4px; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>BGtopoVJ Blue Rectangle/Square PoC Report</h1>
|
||||
<p>This is a weak-label mining report. Treat candidates as review targets, not truth.</p>
|
||||
|
||||
<h2>Candidate summary</h2>
|
||||
<table>
|
||||
<tr><th>Metric</th><th>Value</th></tr>
|
||||
<tr><td>Total candidates</td><td>{{ total }}</td></tr>
|
||||
<tr><td>Average score</td><td>{{ avg_score }}</td></tr>
|
||||
<tr><td>Median score</td><td>{{ med_score }}</td></tr>
|
||||
</table>
|
||||
|
||||
<h2>By inferred fill style</h2>
|
||||
{{ style_table }}
|
||||
|
||||
<h2>QA overlays</h2>
|
||||
<div class="grid">
|
||||
{% for overlay in overlays %}
|
||||
<div class="card">
|
||||
<p><code>{{ overlay.name }}</code></p>
|
||||
<img src="{{ overlay.rel }}" />
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
def build_report(candidate_csvs: list[str | Path], overlays: list[str | Path], out_html: str | Path) -> Path:
|
||||
frames = [pd.read_csv(p) for p in candidate_csvs if Path(p).exists()]
|
||||
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
|
||||
out_html = Path(out_html)
|
||||
ensure_dir(out_html.parent)
|
||||
style_table = "<p>No candidates.</p>"
|
||||
if not df.empty:
|
||||
style_table = df.groupby("fill_style").agg(count=("fill_style", "size"), avg_score=("score", "mean")).reset_index().to_html(index=False)
|
||||
overlay_items = []
|
||||
for ov in overlays:
|
||||
ov = Path(ov)
|
||||
try:
|
||||
rel = ov.relative_to(out_html.parent)
|
||||
except ValueError:
|
||||
rel = ov
|
||||
overlay_items.append({"name": ov.name, "rel": str(rel).replace("\\", "/")})
|
||||
html = Template(REPORT_TEMPLATE).render(
|
||||
total=0 if df.empty else len(df),
|
||||
avg_score="—" if df.empty else f"{df['score'].mean():.3f}",
|
||||
med_score="—" if df.empty else f"{df['score'].median():.3f}",
|
||||
style_table=style_table,
|
||||
overlays=overlay_items,
|
||||
)
|
||||
out_html.write_text(html, encoding="utf-8")
|
||||
return out_html
|
||||
40
bgtopo_poc/train_yolo.py
Normal file
40
bgtopo_poc/train_yolo.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def train_yolo(data_yaml: str | Path, model: str = "yolov8s.pt", imgsz: int = 1024, epochs: int = 80, batch: int = 4, device: str = "0"):
|
||||
"""Train YOLO on the generated weak-label dataset.
|
||||
|
||||
This function imports ultralytics lazily so the rest of the PoC works without GPU dependencies.
|
||||
Review/correct the weak labels before treating this model as useful.
|
||||
"""
|
||||
from ultralytics import YOLO
|
||||
|
||||
yolo = YOLO(model)
|
||||
LOG.info("Starting YOLO training: model=%s data=%s imgsz=%d epochs=%d batch=%d device=%s", model, data_yaml, imgsz, epochs, batch, device)
|
||||
return yolo.train(
|
||||
data=str(data_yaml),
|
||||
imgsz=imgsz,
|
||||
epochs=epochs,
|
||||
batch=batch,
|
||||
device=device,
|
||||
workers=4,
|
||||
cache=False,
|
||||
patience=20,
|
||||
project="runs/bgtopo_bluebox",
|
||||
name=f"{Path(data_yaml).parent.name}_{Path(model).stem}",
|
||||
hsv_h=0.005,
|
||||
hsv_s=0.20,
|
||||
hsv_v=0.18,
|
||||
degrees=0.0,
|
||||
translate=0.05,
|
||||
scale=0.20,
|
||||
fliplr=0.5,
|
||||
flipud=0.5,
|
||||
mosaic=0.25,
|
||||
close_mosaic=15,
|
||||
)
|
||||
43
bgtopo_poc/utils.py
Normal file
43
bgtopo_poc/utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
def setup_logging(verbose: bool = False) -> None:
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG if verbose else logging.INFO,
|
||||
format="%(asctime)s | %(levelname)-8s | %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
|
||||
def load_yaml(path: str | Path) -> Dict[str, Any]:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def ensure_dir(path: str | Path) -> Path:
|
||||
p = Path(path)
|
||||
p.mkdir(parents=True, exist_ok=True)
|
||||
return p
|
||||
|
||||
|
||||
def write_json(path: str | Path, data: Any) -> None:
|
||||
p = Path(path)
|
||||
ensure_dir(p.parent)
|
||||
with open(p, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
def read_json(path: str | Path) -> Any:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def safe_stem(name: str) -> str:
|
||||
return Path(name).stem.replace("/", "_").replace("\\", "_")
|
||||
47
configs/blue_detector.yaml
Normal file
47
configs/blue_detector.yaml
Normal file
@@ -0,0 +1,47 @@
|
||||
# Tuned to be permissive. Expect false positives from rivers, lakes, names and contours.
|
||||
# The purpose is weak-label mining + hard-negative discovery, not final truth.
|
||||
source:
|
||||
base_url: "https://web.uni-plovdiv.net/vedrin/raster/original/"
|
||||
default_crs: "EPSG:4326"
|
||||
|
||||
detector:
|
||||
tile_size: 1400
|
||||
tile_overlap: 96
|
||||
min_width_px: 5
|
||||
min_height_px: 5
|
||||
max_width_px: 95
|
||||
max_height_px: 95
|
||||
min_area_px: 18
|
||||
max_area_px: 6500
|
||||
min_aspect: 0.35
|
||||
max_aspect: 4.25
|
||||
min_blue_fill_ratio: 0.045
|
||||
max_blue_fill_ratio: 0.92
|
||||
min_rectangularity: 0.18
|
||||
min_solidity: 0.22
|
||||
nms_iou_threshold: 0.25
|
||||
# OpenCV HSV hue range is 0-179, not 0-360.
|
||||
hsv_ranges:
|
||||
- name: blue_dark_to_light
|
||||
lower: [82, 28, 45]
|
||||
upper: [137, 255, 255]
|
||||
- name: pale_cyan_low_sat
|
||||
lower: [78, 10, 105]
|
||||
upper: [115, 115, 255]
|
||||
morphology:
|
||||
open_kernel: 2
|
||||
close_kernel: 3
|
||||
dilate_iterations: 1
|
||||
|
||||
coordinate_scoring:
|
||||
search_radius_px: 45
|
||||
strong_score: 0.90
|
||||
weak_score: 0.40
|
||||
crop_size_px: 256
|
||||
|
||||
export:
|
||||
yolo_class_names:
|
||||
- blue_rect_unknown
|
||||
- blue_rect_filled
|
||||
- blue_rect_hollow
|
||||
- blue_rect_border
|
||||
3
data/coordinates/example_points.csv
Normal file
3
data/coordinates/example_points.csv
Normal file
@@ -0,0 +1,3 @@
|
||||
id,lat,lon,expected
|
||||
example_001,42.58837223,23.19638729,unknown
|
||||
example_002,42.70000000,23.32000000,unknown
|
||||
|
18
docker-compose.gpu.yml
Normal file
18
docker-compose.gpu.yml
Normal file
@@ -0,0 +1,18 @@
|
||||
services:
|
||||
bgtopo-bluebox:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile.gpu
|
||||
image: bgtopo-bluebox-poc:gpu
|
||||
working_dir: /app
|
||||
volumes:
|
||||
- .:/app
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
stdin_open: true
|
||||
tty: true
|
||||
19
docker/Dockerfile.gpu
Normal file
19
docker/Dockerfile.gpu
Normal file
@@ -0,0 +1,19 @@
|
||||
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-venv git wget curl ca-certificates \
|
||||
gdal-bin libgdal-dev python3-gdal libgl1 libglib2.0-0 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt /app/requirements.txt
|
||||
RUN python3 -m pip install --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY . /app
|
||||
ENV PYTHONPATH=/app
|
||||
CMD ["bash"]
|
||||
12
pyproject.toml
Normal file
12
pyproject.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[project]
|
||||
name = "bgtopo-bluebox-poc"
|
||||
version = "0.1.0"
|
||||
description = "PoC pipeline for detecting blue square/rectangle symbols in BGtopoVJ rasters and scoring coordinate points."
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["bgtopo_poc"]
|
||||
|
||||
[project.scripts]
|
||||
bgtopo-bluebox = "bgtopo_poc.cli:main"
|
||||
22
requirements.txt
Normal file
22
requirements.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# Core GIS / raster stack
|
||||
rasterio>=1.3.9
|
||||
pyproj>=3.6.1
|
||||
shapely>=2.0.2
|
||||
geopandas>=0.14.4
|
||||
fiona>=1.9.6
|
||||
|
||||
# CV / ML
|
||||
opencv-python>=4.9.0
|
||||
numpy>=1.26.4
|
||||
pandas>=2.2.1
|
||||
Pillow>=10.2.0
|
||||
scikit-learn>=1.4.1
|
||||
ultralytics>=8.2.0
|
||||
PyYAML>=6.0.1
|
||||
tqdm>=4.66.2
|
||||
requests>=2.31.0
|
||||
beautifulsoup4>=4.12.3
|
||||
|
||||
# Reports / QA
|
||||
matplotlib>=3.8.3
|
||||
jinja2>=3.1.3
|
||||
49
scripts/run_pilot.sh
Normal file
49
scripts/run_pilot.sh
Normal file
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Run from repository root.
|
||||
# First run downloads two sheets only. Increase --limit after confirming overlays look sane.
|
||||
|
||||
python -m bgtopo_poc.cli inventory \
|
||||
--config configs/blue_detector.yaml \
|
||||
--out data/manifest.csv \
|
||||
--limit 2
|
||||
|
||||
python -m bgtopo_poc.cli download \
|
||||
--manifest data/manifest.csv \
|
||||
--out-dir data/raw \
|
||||
--out-manifest data/manifest_downloaded.csv \
|
||||
--limit 2
|
||||
|
||||
python - <<'PY'
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
m = pd.read_csv('data/manifest_downloaded.csv')
|
||||
for _, r in m.iterrows():
|
||||
sid = r['sheet_id']
|
||||
Path('data/interim/candidates').mkdir(parents=True, exist_ok=True)
|
||||
print(f"Scanning {sid}")
|
||||
PY
|
||||
|
||||
while IFS=, read -r sheet_id map_url tif_url map_path tif_path; do
|
||||
if [[ "$sheet_id" == "sheet_id" ]]; then continue; fi
|
||||
python -m bgtopo_poc.cli detect \
|
||||
--config configs/blue_detector.yaml \
|
||||
--sheet-id "$sheet_id" \
|
||||
--map "$map_path" \
|
||||
--tif "$tif_path" \
|
||||
--out-dir data/interim/candidates
|
||||
|
||||
python -m bgtopo_poc.cli overlay \
|
||||
--tif "$tif_path" \
|
||||
--candidates "data/interim/candidates/${sheet_id}_candidates.csv" \
|
||||
--out "reports/overlays/${sheet_id}_overlay.png"
|
||||
|
||||
done < data/manifest_downloaded.csv
|
||||
|
||||
python -m bgtopo_poc.cli report \
|
||||
--candidates data/interim/candidates/*_candidates.csv \
|
||||
--overlays reports/overlays/*_overlay.png \
|
||||
--out reports/poc_report.html
|
||||
|
||||
echo "Open reports/poc_report.html"
|
||||
26
scripts/run_training_example.sh
Normal file
26
scripts/run_training_example.sh
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Pick one downloaded sheet/candidate file first.
|
||||
SHEET_ID="${1:-K-34-009-2}"
|
||||
TIF="data/raw/${SHEET_ID}/${SHEET_ID}.tif"
|
||||
CAND="data/interim/candidates/${SHEET_ID}_candidates.csv"
|
||||
OUT="data/yolo/${SHEET_ID}"
|
||||
|
||||
python -m bgtopo_poc.cli export-yolo \
|
||||
--config configs/blue_detector.yaml \
|
||||
--sheet-id "$SHEET_ID" \
|
||||
--tif "$TIF" \
|
||||
--candidates "$CAND" \
|
||||
--out-dir "$OUT" \
|
||||
--tile-size 1024 \
|
||||
--overlap 128
|
||||
|
||||
# RTX 3080 FE 10 GB: start batch 2-4 at imgsz=1024. Raise only if VRAM allows it.
|
||||
python -m bgtopo_poc.cli train-yolo \
|
||||
--data-yaml "$OUT/data.yaml" \
|
||||
--model yolov8s.pt \
|
||||
--imgsz 1024 \
|
||||
--epochs 80 \
|
||||
--batch 4 \
|
||||
--device 0
|
||||
Reference in New Issue
Block a user