# garmin-img-format-parsing/bgtopo_poc/inventory.py

from __future__ import annotations

import logging
import re
from dataclasses import asdict, dataclass, fields
from pathlib import Path
from typing import Iterable, List, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from .utils import ensure_dir

LOG = logging.getLogger(__name__)


@dataclass
class SheetAsset:
    """One map sheet: its remote .map/.tif URLs and, once downloaded, local paths."""

    # Sheet identifier; derived from the file name without its extension.
    sheet_id: str
    # Remote URL of the .map file, or None when not known.
    map_url: Optional[str]
    # Remote URL of the .tif/.tiff raster, or None when not known.
    tif_url: Optional[str]
    # Local path of the downloaded .map file; None until it has been fetched.
    map_path: Optional[str] = None
    # Local path of the downloaded raster; None until it has been fetched.
    tif_path: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the fields as a plain ``{field_name: value}`` dictionary."""
        as_mapping = asdict(self)
        return as_mapping


def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
    """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing.

    Fetches ``base_url``, collects every ``<a href>`` whose target ends in
    ``.map``, ``.tif`` or ``.tiff`` (case-insensitive) and groups the links by
    file name without extension.

    Args:
        base_url: URL of the HTML directory listing to scan.
        include_100k: When false (the default), hrefs containing ``"100k"``
            (case-insensitive) are skipped.

    Returns:
        Assets that have both a ``.map`` URL and a raster URL, sorted by
        ``sheet_id``. Sheets with only one of the two files are dropped.

    Raises:
        requests.HTTPError: If the listing request returns a 4xx/5xx status.
    """
    LOG.info("Discovering assets from %s", base_url)
    response = requests.get(base_url, timeout=60)
    # Without this check an error page would be parsed like a listing and the
    # function would quietly report zero assets.
    response.raise_for_status()
    # Resolve relative hrefs against the URL that actually served the page:
    # urljoin() drops the last path segment of a base that lacks a trailing
    # slash, so the caller's spelling of base_url is not a reliable base.
    listing_url = response.url or base_url
    soup = BeautifulSoup(response.text, "html.parser")

    by_sheet: dict[str, SheetAsset] = {}
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        # Skip sort-order links, absolute paths and the parent-directory link.
        if href.startswith(("?", "/")) or href == "../":
            continue
        lowered = href.lower()
        if not include_100k and "100k" in lowered:
            continue
        if not lowered.endswith((".map", ".tif", ".tiff")):
            continue
        sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
        item = by_sheet.setdefault(sheet_id, SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None))
        full_url = urljoin(listing_url, href)
        if lowered.endswith(".map"):
            item.map_url = full_url
        else:
            item.tif_url = full_url

    # Only sheets with both halves of the pair are usable downstream.
    assets = [v for v in by_sheet.values() if v.map_url and v.tif_url]
    assets.sort(key=lambda x: x.sheet_id)
    LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
    return assets


def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
    """Write ``assets`` to a CSV manifest with one row per sheet.

    Args:
        assets: Assets to serialise; each is converted with ``to_dict()``.
        out_csv: Destination file; its parent directory is created if needed.

    Returns:
        ``out_csv`` as a ``Path``.
    """
    rows = [a.to_dict() for a in assets]
    # Pin the header to the dataclass fields so an empty asset list still
    # produces a header row; pandas.read_csv (used by read_manifest_csv)
    # cannot parse a file that has no columns at all.
    columns = [f.name for f in fields(SheetAsset)]
    out_csv = Path(out_csv)
    ensure_dir(out_csv.parent)
    pd.DataFrame(rows, columns=columns).to_csv(out_csv, index=False)
    LOG.info("Wrote manifest: %s", out_csv)
    return out_csv


def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
    """Load a manifest CSV back into a list of ``SheetAsset`` objects.

    Args:
        path: CSV file with a ``sheet_id`` column and, optionally, ``map_url``,
            ``tif_url``, ``map_path`` and ``tif_path`` columns.

    Returns:
        One ``SheetAsset`` per row, in file order. Empty cells and missing
        optional columns become ``None``.

    Raises:
        KeyError: If the ``sheet_id`` column is absent and the file has rows.
    """
    # Read every column as text and disable NA sniffing: type inference would
    # otherwise turn a numeric-looking sheet id into a number (losing leading
    # zeros) and treat literal values such as "NA" as missing.
    df = pd.read_csv(path, dtype=str, keep_default_na=False).fillna("")

    def _optional(record: dict, column: str) -> Optional[str]:
        """Return the cell as a string, or None when it is empty or absent."""
        value = record.get(column)
        return str(value) if value else None

    return [
        SheetAsset(
            sheet_id=str(record["sheet_id"]),
            map_url=_optional(record, "map_url"),
            tif_url=_optional(record, "tif_url"),
            map_path=_optional(record, "map_path"),
            tif_path=_optional(record, "tif_path"),
        )
        for record in df.to_dict("records")
    ]


def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
    """Stream ``url`` to ``out_path``, showing a progress bar.

    Args:
        url: Remote file to fetch.
        out_path: Destination file; its parent directory is created if needed.
        overwrite: When false, an existing non-empty ``out_path`` is kept and
            no request is made.

    Returns:
        ``out_path``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    if not overwrite and out_path.exists() and out_path.stat().st_size > 0:
        return out_path
    ensure_dir(out_path.parent)
    # Download into a sibling ".part" file and rename only on success.
    # Writing straight to out_path would leave a truncated, non-empty file
    # behind after an interrupted transfer, and the existence check above
    # would then accept it as a finished download on the next run.
    part_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            total = int(r.headers.get("content-length", "0") or 0)
            with open(part_path, "wb") as f, tqdm(
                total=total, unit="B", unit_scale=True, desc=out_path.name
            ) as pbar:
                for chunk in r.iter_content(chunk_size=1024 * 512):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
        part_path.replace(out_path)
    finally:
        # After a successful rename the part file is gone; this only removes
        # the leftovers of a failed or interrupted transfer.
        if part_path.exists():
            part_path.unlink()
    return out_path


def download_assets(
    assets: List[SheetAsset],
    out_dir: str | Path,
    limit: Optional[int] = None,
    overwrite: bool = False,
) -> List[SheetAsset]:
    """Download the .map and .tif file of each asset into ``out_dir``.

    Files are stored as ``<out_dir>/<sheet_id>/<sheet_id>.map`` and
    ``<out_dir>/<sheet_id>/<sheet_id>.tif``; each asset's ``map_path`` and
    ``tif_path`` are updated in place with the resulting local paths.

    Args:
        assets: Assets to fetch.
        out_dir: Root directory for the per-sheet sub-directories.
        limit: Process only the first ``limit`` assets. A falsy value
            (``None`` or ``0``) means no limit.
        overwrite: Forwarded to ``_download_one``.

    Returns:
        The processed assets: ``assets`` itself when no limit applies,
        otherwise the leading slice.
    """
    root = Path(out_dir)
    chosen = assets if not limit else assets[:limit]
    for asset in chosen:
        target_dir = root / asset.sheet_id
        if asset.map_url:
            map_target = target_dir / f"{asset.sheet_id}.map"
            asset.map_path = str(_download_one(asset.map_url, map_target, overwrite=overwrite))
        if asset.tif_url:
            # The raster is always stored with a ".tif" suffix.
            tif_target = target_dir / f"{asset.sheet_id}.tif"
            asset.tif_path = str(_download_one(asset.tif_url, tif_target, overwrite=overwrite))
    return chosen