from __future__ import annotations

# Discover .map/.tif raster sheet pairs from an HTTP directory listing, keep
# them in a CSV manifest, and download them into per-sheet folders.

import logging
import re
from dataclasses import asdict, dataclass, fields
from pathlib import Path
from typing import Iterable, List, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from .utils import ensure_dir

LOG = logging.getLogger(__name__)

# File extensions (compared case-insensitively) that identify sheet assets.
_ASSET_SUFFIXES = (".map", ".tif", ".tiff")
# Strips one of the asset extensions from a file name to obtain the sheet id.
_ASSET_SUFFIX_RE = re.compile(r"\.(map|tif|tiff)$", flags=re.IGNORECASE)


@dataclass
class SheetAsset:
    """Remote URLs and (once downloaded) local paths of one sheet's files."""

    sheet_id: str
    map_url: Optional[str]
    tif_url: Optional[str]
    map_path: Optional[str] = None
    tif_path: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the fields of this asset as a plain dictionary."""
        return asdict(self)


def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
    """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing.

    Args:
        base_url: URL of the HTML directory listing. Relative links are
            resolved against it with ``urljoin``, so it should end with ``/``;
            otherwise its last path segment is replaced rather than extended.
        include_100k: When false, links whose text contains ``100k``
            (case-insensitive) are ignored.

    Returns:
        Assets that have both a .map and a .tif/.tiff URL, sorted by sheet id.

    Raises:
        requests.HTTPError: If the listing request returns an error status.
        requests.RequestException: On connection problems or timeout.
    """
    LOG.info("Discovering assets from %s", base_url)
    response = requests.get(base_url, timeout=60)
    # Fail loudly instead of parsing an error page into an empty asset list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]

    by_sheet: dict[str, SheetAsset] = {}
    for href in hrefs:
        # Skip sort/query links, absolute paths and the parent-directory link.
        if href.startswith(("?", "/")) or href == "../":
            continue
        lowered = href.lower()
        if not include_100k and "100k" in lowered:
            continue
        if not lowered.endswith(_ASSET_SUFFIXES):
            continue

        sheet_id = _ASSET_SUFFIX_RE.sub("", Path(href).name)
        item = by_sheet.setdefault(
            sheet_id, SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None)
        )
        full_url = urljoin(base_url, href)
        if lowered.endswith(".map"):
            item.map_url = full_url
        else:
            item.tif_url = full_url

    # Only sheets for which both files were listed are usable.
    assets = [v for v in by_sheet.values() if v.map_url and v.tif_url]
    assets.sort(key=lambda x: x.sheet_id)
    LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
    return assets


def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
    """Write the assets to a CSV manifest and return the manifest path.

    The parent directory is created through ``ensure_dir``. The header row is
    written even when ``assets`` is empty, so the file can always be read back
    with ``read_manifest_csv``.
    """
    rows = [a.to_dict() for a in assets]
    out_csv = Path(out_csv)
    ensure_dir(out_csv.parent)
    if rows:
        frame = pd.DataFrame(rows)
    else:
        # A frame built from no rows has no columns and would produce a
        # header-less file that pandas refuses to parse.
        frame = pd.DataFrame(columns=[f.name for f in fields(SheetAsset)])
    frame.to_csv(out_csv, index=False)
    LOG.info("Wrote manifest: %s", out_csv)
    return out_csv


def _blank_to_none(value: object) -> Optional[str]:
    """Return ``value`` as a string, or ``None`` when it is missing or empty."""
    return str(value or "") or None


def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
    """Load a manifest CSV into a list of ``SheetAsset`` objects.

    Every column is read as text so identifiers keep their exact spelling
    (for example leading zeros). Empty cells and absent optional columns
    become ``None``.

    Raises:
        KeyError: If the manifest has rows but no ``sheet_id`` column.
    """
    # dtype=str prevents numeric inference; keep_default_na=False keeps
    # literal values such as "NA" and yields "" for empty cells.
    df = pd.read_csv(path, dtype=str, keep_default_na=False)
    return [
        SheetAsset(
            sheet_id=str(row["sheet_id"]),
            map_url=_blank_to_none(row.get("map_url")),
            tif_url=_blank_to_none(row.get("tif_url")),
            map_path=_blank_to_none(row.get("map_path")),
            tif_path=_blank_to_none(row.get("tif_path")),
        )
        for row in df.to_dict("records")
    ]


def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
    """Stream ``url`` to ``out_path`` with a progress bar and return the path.

    An existing non-empty file is reused unless ``overwrite`` is true. Data is
    written to a sibling ``.part`` file and moved into place only after the
    transfer finishes, so an interrupted download is never mistaken for a
    complete one on the next run.

    Raises:
        requests.HTTPError: If the server returns an error status.
    """
    if out_path.exists() and out_path.stat().st_size > 0 and not overwrite:
        return out_path
    ensure_dir(out_path.parent)

    part_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            total = int(r.headers.get("content-length", "0") or 0)
            # An unknown size is passed as None so tqdm shows a plain counter.
            with open(part_path, "wb") as f, tqdm(
                total=total or None, unit="B", unit_scale=True, desc=out_path.name
            ) as pbar:
                for chunk in r.iter_content(chunk_size=1024 * 512):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
        part_path.replace(out_path)
    except BaseException:
        # Remove the incomplete temporary file, including on Ctrl-C.
        if part_path.exists():
            part_path.unlink()
        raise
    return out_path


def download_assets(
    assets: List[SheetAsset],
    out_dir: str | Path,
    limit: Optional[int] = None,
    overwrite: bool = False,
) -> List[SheetAsset]:
    """Download the .map/.tif files of the given assets into ``out_dir``.

    Each sheet gets its own ``out_dir/<sheet_id>/`` folder. The TIFF is always
    saved with a ``.tif`` extension. The processed assets are updated in place
    with their local ``map_path``/``tif_path``.

    Args:
        assets: Assets to download.
        out_dir: Root output directory.
        limit: Download only the first ``limit`` assets; ``None`` or ``0``
            means no limit.
        overwrite: Re-download files that already exist.

    Returns:
        The assets that were processed (the first ``limit`` of ``assets``).
    """
    out_dir = Path(out_dir)
    selected = assets[:limit] if limit else assets
    for item in selected:
        sheet_dir = out_dir / item.sheet_id
        if item.map_url:
            map_target = sheet_dir / f"{item.sheet_id}.map"
            item.map_path = str(_download_one(item.map_url, map_target, overwrite=overwrite))
        if item.tif_url:
            tif_target = sheet_dir / f"{item.sheet_id}.tif"
            item.tif_path = str(_download_one(item.tif_url, tif_target, overwrite=overwrite))
    return selected