"""Discover, catalogue, and download .map/.tif raster sheet assets."""
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, asdict
|
|
from pathlib import Path
|
|
from typing import Iterable, List, Optional
|
|
from urllib.parse import urljoin
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from tqdm import tqdm
|
|
|
|
from .utils import ensure_dir
|
|
|
|
# Module-level logger, named after this module so handlers/levels can be
# configured per module by the application.
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class SheetAsset:
    """Record describing one sheet: its remote URLs and local file paths.

    The two URL fields are required (but may be ``None``); the two path
    fields default to ``None`` until a local copy is recorded.
    """

    # Identifier shared by the sheet's .map and .tif files.
    sheet_id: str
    # Remote URL of the sheet's .map file, or None when unknown.
    map_url: Optional[str]
    # Remote URL of the sheet's .tif/.tiff raster, or None when unknown.
    tif_url: Optional[str]
    # Local path of the .map file, or None when not recorded.
    map_path: Optional[str] = None
    # Local path of the raster file, or None when not recorded.
    tif_path: Optional[str] = None

    def to_dict(self):
        """Return the fields of this asset as a plain ``dict`` keyed by field name."""
        field_values = asdict(self)
        return field_values
|
|
|
|
|
|
def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
    """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing.

    The page at ``base_url`` is fetched and every ``<a href>`` ending in
    ``.map``, ``.tif`` or ``.tiff`` (case-insensitive) is grouped by its file
    name without that extension.

    Args:
        base_url: URL of the HTML directory listing to scan.
        include_100k: When false (the default), links whose href contains
            ``"100k"`` (case-insensitive) are ignored.

    Returns:
        Assets that have BOTH a .map URL and a raster URL, sorted by
        ``sheet_id``. Sheets with only one of the two files are dropped.

    Raises:
        requests.HTTPError: If the listing request returns a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    LOG.info("Discovering assets from %s", base_url)
    response = requests.get(base_url, timeout=60)
    # Fail loudly on an error status instead of parsing the error page and
    # silently reporting zero assets.
    response.raise_for_status()
    # Relative links must be resolved against the URL that actually served the
    # page (e.g. after a "/dir" -> "/dir/" redirect), not the one requested.
    listing_url = response.url or base_url

    soup = BeautifulSoup(response.text, "html.parser")
    hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]

    by_sheet: dict[str, SheetAsset] = {}
    for href in hrefs:
        # Skip sort-order links ("?..."), absolute paths and the parent link.
        if href.startswith(("?", "/")) or href == "../":
            continue
        lowered = href.lower()
        if not include_100k and "100k" in lowered:
            continue
        if not lowered.endswith((".map", ".tif", ".tiff")):
            continue

        sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
        item = by_sheet.get(sheet_id)
        if item is None:
            item = by_sheet[sheet_id] = SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None)

        full_url = urljoin(listing_url, href)
        if lowered.endswith(".map"):
            item.map_url = full_url
        else:
            # Both .tif and .tiff count as the raster half of the pair.
            item.tif_url = full_url

    # Only complete pairs are usable downstream.
    assets = sorted(
        (v for v in by_sheet.values() if v.map_url and v.tif_url),
        key=lambda x: x.sheet_id,
    )
    LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
    return assets
|
|
|
|
|
|
def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
    """Write ``assets`` to a CSV manifest, one row per asset.

    Args:
        assets: Assets to serialise; each is converted with ``to_dict()``.
        out_csv: Destination file. Its parent directory is passed to
            ``ensure_dir`` before writing.

    Returns:
        ``out_csv`` as a ``Path``.
    """
    # Local import: only needed for the empty-input fallback below.
    from dataclasses import fields

    rows = [a.to_dict() for a in assets]
    out_csv = Path(out_csv)
    ensure_dir(out_csv.parent)

    if rows:
        frame = pd.DataFrame(rows)
    else:
        # With no rows pandas has no column names to emit, which would leave
        # a header-less file. Write the header explicitly so the manifest
        # still has the expected columns.
        frame = pd.DataFrame(columns=[f.name for f in fields(SheetAsset)])
    frame.to_csv(out_csv, index=False)

    LOG.info("Wrote manifest: %s", out_csv)
    return out_csv
|
|
|
|
|
|
def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
    """Load a CSV manifest into a list of ``SheetAsset`` objects.

    Args:
        path: CSV file with a ``sheet_id`` column and, optionally, the
            ``map_url``, ``tif_url``, ``map_path`` and ``tif_path`` columns.

    Returns:
        One ``SheetAsset`` per row, in file order. Missing or empty optional
        cells become ``None``.

    Raises:
        KeyError: If the ``sheet_id`` column is absent.
    """
    # dtype=str stops pandas from inferring numeric types, which would mangle
    # identifiers such as "0123" (-> 123) or "1.10" (-> 1.1).
    df = pd.read_csv(path, dtype=str).fillna("")

    def _optional(row: dict, column: str) -> Optional[str]:
        # Absent column or empty cell -> None; anything else -> its string form.
        return str(row.get(column) or "") or None

    return [
        SheetAsset(
            sheet_id=str(row["sheet_id"]),
            map_url=_optional(row, "map_url"),
            tif_url=_optional(row, "tif_url"),
            map_path=_optional(row, "map_path"),
            tif_path=_optional(row, "tif_path"),
        )
        for row in df.to_dict("records")
    ]
|
|
|
|
|
|
def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
    """Stream ``url`` to ``out_path`` with a progress bar and return ``out_path``.

    Args:
        url: Remote file to fetch.
        out_path: Destination file. Its parent directory is passed to
            ``ensure_dir`` before downloading.
        overwrite: When false, an existing non-empty ``out_path`` is kept and
            no request is made.

    Returns:
        ``out_path``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    if out_path.exists() and out_path.stat().st_size > 0 and not overwrite:
        return out_path
    ensure_dir(out_path.parent)

    # Download into a sibling temp file and rename it into place only once the
    # transfer completes. Otherwise an interrupted transfer leaves a non-empty
    # partial file that the check above would treat as complete next time.
    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            # A missing or empty Content-Length is treated as 0.
            total = int(r.headers.get("content-length", "0") or 0)
            with open(tmp_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=out_path.name) as pbar:
                for chunk in r.iter_content(chunk_size=1024 * 512):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
        tmp_path.replace(out_path)
    finally:
        # After a successful rename the temp file is gone; on any failure
        # (including Ctrl-C) remove the partial data.
        if tmp_path.exists():
            tmp_path.unlink()
    return out_path
|
|
|
|
|
|
def download_assets(
    assets: List[SheetAsset],
    out_dir: str | Path,
    limit: Optional[int] = None,
    overwrite: bool = False,
) -> List[SheetAsset]:
    """Download the .map and .tif files of each asset into ``out_dir``.

    Files are stored as ``out_dir/<sheet_id>/<sheet_id>.map`` and
    ``out_dir/<sheet_id>/<sheet_id>.tif``. Each processed asset is updated in
    place: ``map_path`` / ``tif_path`` are set to the downloaded file paths.

    Args:
        assets: Assets to fetch.
        out_dir: Root directory for the per-sheet folders.
        limit: If truthy, only ``assets[:limit]`` is processed; ``None`` or
            ``0`` processes every asset.
        overwrite: Forwarded to ``_download_one``.

    Returns:
        The processed assets: the slice when ``limit`` is truthy, otherwise
        the ``assets`` list itself.
    """
    root = Path(out_dir)

    if limit:
        chosen = assets[:limit]
    else:
        chosen = assets

    for asset in chosen:
        sheet = asset.sheet_id
        target_dir = root / sheet

        map_source = asset.map_url
        if map_source:
            map_target = target_dir / f"{sheet}.map"
            asset.map_path = str(_download_one(map_source, map_target, overwrite=overwrite))

        tif_source = asset.tif_url
        if tif_source:
            # The raster is always saved with a .tif extension.
            tif_target = target_dir / f"{sheet}.tif"
            asset.tif_path = str(_download_one(tif_source, tif_target, overwrite=overwrite))

    return chosen
|