rnd-v1

2026-05-03 21:58:47 +03:00
parent fd44235ff4
commit e45d1cb6b4
18 changed files with 1259 additions and 0 deletions
--- a/bgtopo_poc/inventory.py
+++ b/bgtopo_poc/inventory.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+import logging
+import re
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Iterable, List, Optional
+from urllib.parse import urljoin
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+from .utils import ensure_dir
+
+LOG = logging.getLogger(__name__)
+
+
+@dataclass
+class SheetAsset:
+    """Record for one map sheet: its remote .map/.tif URLs and local file paths."""
+
+    # Identifier of the sheet (the file name without its extension).
+    sheet_id: str
+    # Remote URL of the sheet's .map file, or None when not known.
+    map_url: Optional[str]
+    # Remote URL of the sheet's .tif/.tiff raster, or None when not known.
+    tif_url: Optional[str]
+    # Local path of the downloaded .map file; None until it has been set.
+    map_path: Optional[str] = None
+    # Local path of the downloaded raster; None until it has been set.
+    tif_path: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        """Return all fields as a plain ``{field_name: value}`` dictionary."""
+        as_mapping = asdict(self)
+        return as_mapping
+
+
+def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
+    """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing.
+
+    Fetches ``base_url``, collects every ``<a href>`` that ends in ``.map``,
+    ``.tif`` or ``.tiff`` (case-insensitive) and groups the links by file stem.
+
+    Args:
+        base_url: URL of the HTML directory listing.
+        include_100k: When False (default), links whose href contains
+            ``"100k"`` (case-insensitive) are skipped.
+
+    Returns:
+        Assets that have BOTH a .map and a raster URL, sorted by ``sheet_id``.
+        Sheets with only one of the two files are dropped.
+
+    Raises:
+        requests.HTTPError: If the listing request returns a 4xx/5xx status.
+        requests.RequestException: On connection problems or timeout.
+    """
+    LOG.info("Discovering assets from %s", base_url)
+    response = requests.get(base_url, timeout=60)
+    # Fail loudly on an HTTP error instead of parsing the error page and
+    # silently reporting zero assets.
+    response.raise_for_status()
+    # Relative links must be resolved against the URL that actually served the
+    # page (after redirects, e.g. ".../dir" -> ".../dir/"); joining against a
+    # slash-less base would drop the last path segment.
+    listing_url = response.url or base_url
+    soup = BeautifulSoup(response.text, "html.parser")
+    hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]
+
+    wanted_suffixes = (".map", ".tif", ".tiff")
+    by_sheet: dict[str, SheetAsset] = {}
+    for href in hrefs:
+        # Skip sort-order links ("?C=N;O=D"), absolute paths and the parent link.
+        if href.startswith(("?", "/")) or href == "../":
+            continue
+        lowered = href.lower()
+        if not include_100k and "100k" in lowered:
+            continue
+        if not lowered.endswith(wanted_suffixes):
+            continue
+        sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
+        item = by_sheet.setdefault(sheet_id, SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None))
+        full_url = urljoin(listing_url, href)
+        if lowered.endswith(".map"):
+            item.map_url = full_url
+        else:
+            item.tif_url = full_url
+
+    # Only complete pairs are usable downstream.
+    assets = [v for v in by_sheet.values() if v.map_url and v.tif_url]
+    assets.sort(key=lambda x: x.sheet_id)
+    LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
+    return assets
+
+
+def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
+    """Write ``assets`` to a CSV manifest with one row per sheet.
+
+    The column set is always the ``SheetAsset`` field list, so the header row
+    is written even when ``assets`` is empty and the file stays readable by
+    ``pd.read_csv``.
+
+    Args:
+        assets: Assets to serialise (each converted via ``to_dict()``).
+        out_csv: Destination file; its parent directory is created if needed.
+
+    Returns:
+        The destination as a ``Path``.
+    """
+    from dataclasses import fields
+
+    rows = [a.to_dict() for a in assets]
+    out_csv = Path(out_csv)
+    ensure_dir(out_csv.parent)
+    # Pin the columns explicitly: a DataFrame built from an empty list has no
+    # columns, which would produce a header-less file that cannot be read back.
+    columns = [f.name for f in fields(SheetAsset)]
+    pd.DataFrame(rows, columns=columns).to_csv(out_csv, index=False)
+    LOG.info("Wrote manifest: %s", out_csv)
+    return out_csv
+
+
+def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
+    """Load a manifest CSV into a list of ``SheetAsset`` objects.
+
+    The ``sheet_id`` column is required; ``map_url``, ``tif_url``,
+    ``map_path`` and ``tif_path`` are optional, and empty or missing cells
+    become ``None``.
+
+    Args:
+        path: Location of the CSV file.
+
+    Returns:
+        One ``SheetAsset`` per CSV row, in file order.
+    """
+    # Read every cell as text: type inference would turn a numeric-looking
+    # sheet id such as "0012" into the integer 12, and the default NA handling
+    # would turn literal tokens like "NA" into NaN.
+    df = pd.read_csv(path, dtype=str, keep_default_na=False).fillna("")
+
+    def _optional(row: dict, column: str) -> Optional[str]:
+        """Return the cell as a string, or None when empty or absent."""
+        value = row.get(column)
+        return str(value) if value else None
+
+    return [
+        SheetAsset(
+            sheet_id=str(row["sheet_id"]),
+            map_url=_optional(row, "map_url"),
+            tif_url=_optional(row, "tif_url"),
+            map_path=_optional(row, "map_path"),
+            tif_path=_optional(row, "tif_path"),
+        )
+        for row in df.to_dict("records")
+    ]
+
+
+def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
+    """Download ``url`` to ``out_path`` with a progress bar.
+
+    An existing non-empty ``out_path`` is reused unless ``overwrite`` is True.
+    Data is streamed into a sibling ``<name>.part`` file and renamed onto
+    ``out_path`` only after the transfer finished, so an interrupted download
+    never leaves a truncated file that a later run would accept as complete.
+
+    Args:
+        url: Remote file to fetch.
+        out_path: Final destination; its parent directory is created if needed.
+        overwrite: Re-download even when ``out_path`` already has content.
+
+    Returns:
+        ``out_path``.
+
+    Raises:
+        requests.HTTPError: If the server answers with a 4xx/5xx status.
+    """
+    if out_path.exists() and out_path.stat().st_size > 0 and not overwrite:
+        return out_path
+    ensure_dir(out_path.parent)
+    part_path = out_path.with_name(out_path.name + ".part")
+    try:
+        with requests.get(url, stream=True, timeout=120) as r:
+            r.raise_for_status()
+            total = int(r.headers.get("content-length", "0") or 0)
+            # An unknown size is passed as None so tqdm shows plain counters
+            # rather than a bar against a bogus total of zero.
+            with open(part_path, "wb") as f, tqdm(
+                total=total or None, unit="B", unit_scale=True, desc=out_path.name
+            ) as pbar:
+                for chunk in r.iter_content(chunk_size=1024 * 512):
+                    if chunk:
+                        f.write(chunk)
+                        pbar.update(len(chunk))
+        # Publish the finished file in one step.
+        part_path.replace(out_path)
+    finally:
+        # After a successful rename the temp file is gone; anything still
+        # here is a leftover from a failed or interrupted transfer.
+        if part_path.exists():
+            part_path.unlink()
+    return out_path
+
+
+def download_assets(
+    assets: List[SheetAsset],
+    out_dir: str | Path,
+    limit: Optional[int] = None,
+    overwrite: bool = False,
+) -> List[SheetAsset]:
+    """Download the .map and raster file of each selected asset.
+
+    Files are stored as ``<out_dir>/<sheet_id>/<sheet_id>.map`` and
+    ``<out_dir>/<sheet_id>/<sheet_id>.tif``. The ``map_path`` / ``tif_path``
+    attributes of the processed assets are updated in place.
+
+    Args:
+        assets: Candidate assets.
+        out_dir: Root directory for the downloads.
+        limit: Process only the first ``limit`` assets; a falsy value
+            (``None`` or ``0``) selects all of them.
+        overwrite: Forwarded to ``_download_one``.
+
+    Returns:
+        The list of assets that were processed.
+    """
+    root = Path(out_dir)
+    chosen = assets if not limit else assets[:limit]
+    for asset in chosen:
+        target_dir = root / asset.sheet_id
+        # Same order as the file pair: .map first, then the raster.
+        for kind in ("map", "tif"):
+            url = getattr(asset, f"{kind}_url")
+            if not url:
+                continue
+            saved = _download_one(url, target_dir / f"{asset.sheet_id}.{kind}", overwrite=overwrite)
+            setattr(asset, f"{kind}_path", str(saved))
+    return chosen