rnd-v1
This commit is contained in:
115
bgtopo_poc/inventory.py
Normal file
115
bgtopo_poc/inventory.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm
|
||||
|
||||
from .utils import ensure_dir
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class SheetAsset:
    """Record for one sheet: its remote ``.map``/``.tif`` URLs and local copies.

    The two URL fields are required constructor arguments (either may be
    ``None``); the two path fields default to ``None``.
    """

    # Key that groups the .map and .tif entries of the same sheet.
    sheet_id: str
    # Remote location of the .map file, or None when not known.
    map_url: Optional[str]
    # Remote location of the .tif/.tiff file, or None when not known.
    tif_url: Optional[str]
    # Local path of the .map file, or None when not set.
    map_path: Optional[str] = None
    # Local path of the .tif file, or None when not set.
    tif_path: Optional[str] = None

    def to_dict(self) -> dict[str, Optional[str]]:
        """Return all fields as a plain dict keyed by field name."""
        return asdict(self)
|
||||
|
||||
|
||||
def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
    """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing.

    Fetches ``base_url``, collects every ``<a href>`` on the page, and groups
    links ending in ``.map``, ``.tif`` or ``.tiff`` (case-insensitive) by file
    name without extension.

    Args:
        base_url: URL of the HTML directory listing; relative hrefs are
            resolved against it.
        include_100k: When False (default), any href containing ``"100k"``
            (case-insensitive) is skipped.

    Returns:
        Assets that have BOTH a .map and a .tif/.tiff URL, sorted by
        ``sheet_id``. Sheets with only one of the two files are dropped.

    Raises:
        requests.HTTPError: If the listing request returns a 4xx/5xx status.
    """
    LOG.info("Discovering assets from %s", base_url)
    response = requests.get(base_url, timeout=60)
    # Without this check an error page would be parsed like a listing and the
    # function would quietly report zero assets.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]

    by_sheet: dict[str, SheetAsset] = {}
    for href in hrefs:
        # Skip query-string links, absolute-path links and the parent-dir entry.
        if href.startswith(("?", "/")) or href == "../":
            continue
        lowered = href.lower()
        if not include_100k and "100k" in lowered:
            continue
        if not lowered.endswith((".map", ".tif", ".tiff")):
            continue

        sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
        item = by_sheet.get(sheet_id)
        if item is None:
            item = by_sheet[sheet_id] = SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None)

        full_url = urljoin(base_url, href)
        if lowered.endswith(".map"):
            item.map_url = full_url
        else:
            # .tif and .tiff share one slot; the later link in the page wins.
            item.tif_url = full_url

    assets = sorted(
        (v for v in by_sheet.values() if v.map_url and v.tif_url),
        key=lambda x: x.sheet_id,
    )
    LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
    return assets
|
||||
|
||||
|
||||
def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
    """Write ``assets`` to a CSV manifest, one row per asset.

    Args:
        assets: Assets to serialise; each is converted with ``to_dict()``.
        out_csv: Destination file path.

    Returns:
        ``out_csv`` as a ``Path``.
    """
    # Function-scope import so the module's import block stays untouched.
    from dataclasses import fields

    rows = [a.to_dict() for a in assets]
    if rows:
        frame = pd.DataFrame(rows)
    else:
        # With no rows pandas cannot infer columns and would write a file
        # without the header; fall back to the dataclass field names.
        frame = pd.DataFrame(columns=[f.name for f in fields(SheetAsset)])

    out_csv = Path(out_csv)
    # presumably creates the parent directory if missing — confirm in .utils
    ensure_dir(out_csv.parent)
    frame.to_csv(out_csv, index=False)
    LOG.info("Wrote manifest: %s", out_csv)
    return out_csv
|
||||
|
||||
|
||||
def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
    """Load a CSV manifest back into a list of ``SheetAsset``.

    Every column is read as text so identifiers survive unchanged (pandas'
    type inference would otherwise turn ``"0123"`` into ``123``, ``"1.10"``
    into ``1.1`` and ``"NA"`` into a missing value).

    Args:
        path: Manifest CSV file. It must contain a ``sheet_id`` column; the
            URL/path columns are optional.

    Returns:
        One ``SheetAsset`` per row, in file order. Empty or absent URL/path
        cells become ``None``.

    Raises:
        KeyError: If the ``sheet_id`` column is missing and the file has rows.
    """
    df = pd.read_csv(path, dtype=str, keep_default_na=False).fillna("")

    def _optional(row: dict, key: str) -> Optional[str]:
        # Missing column and empty cell both collapse to None.
        return str(row.get(key) or "") or None

    return [
        SheetAsset(
            sheet_id=str(row["sheet_id"]),
            map_url=_optional(row, "map_url"),
            tif_url=_optional(row, "tif_url"),
            map_path=_optional(row, "map_path"),
            tif_path=_optional(row, "tif_path"),
        )
        for row in df.to_dict(orient="records")
    ]
|
||||
|
||||
|
||||
def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
    """Stream ``url`` to ``out_path`` with a progress bar.

    The payload is first written to a ``<name>.part`` sibling and renamed onto
    ``out_path`` only after the whole body has been received. A failed or
    interrupted transfer therefore never leaves a truncated file that the
    "already downloaded" check below would accept on the next run.

    Args:
        url: Remote file to fetch.
        out_path: Final destination on disk.
        overwrite: When False, an existing non-empty ``out_path`` is kept and
            no request is made.

    Returns:
        ``out_path``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    if not overwrite and out_path.exists() and out_path.stat().st_size > 0:
        return out_path

    # presumably creates the directory if missing — confirm in .utils
    ensure_dir(out_path.parent)
    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            # A missing or empty Content-Length header yields total == 0.
            total = int(r.headers.get("content-length", "0") or 0)
            with open(tmp_path, "wb") as f, tqdm(
                total=total, unit="B", unit_scale=True, desc=out_path.name
            ) as pbar:
                for chunk in r.iter_content(chunk_size=1024 * 512):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
        # Reached only when the transfer finished without raising.
        tmp_path.replace(out_path)
    finally:
        # After a successful rename the temp file is gone; on any failure
        # (including Ctrl-C) remove the partial download.
        if tmp_path.exists():
            tmp_path.unlink()
    return out_path
|
||||
|
||||
|
||||
def download_assets(
    assets: List[SheetAsset],
    out_dir: str | Path,
    limit: Optional[int] = None,
    overwrite: bool = False,
) -> List[SheetAsset]:
    """Download the .map/.tif files of each asset into per-sheet folders.

    Files are stored as ``<out_dir>/<sheet_id>/<sheet_id>.map`` and
    ``<out_dir>/<sheet_id>/<sheet_id>.tif``. The assets are updated in place:
    ``map_path``/``tif_path`` are set to the downloaded file paths.

    Args:
        assets: Candidate assets.
        out_dir: Root directory for the downloads.
        limit: Process only the first ``limit`` assets. Any falsy value
            (``None`` or ``0``) means no limit.
        overwrite: Forwarded to ``_download_one``.

    Returns:
        The processed assets: ``assets`` itself when no limit applies,
        otherwise the sliced list.
    """
    root = Path(out_dir)

    if limit:
        chosen = assets[:limit]
    else:
        chosen = assets

    for asset in chosen:
        stem = asset.sheet_id
        target_dir = root / stem
        if asset.map_url:
            map_file = _download_one(asset.map_url, target_dir / f"{stem}.map", overwrite=overwrite)
            asset.map_path = str(map_file)
        if asset.tif_url:
            tif_file = _download_one(asset.tif_url, target_dir / f"{stem}.tif", overwrite=overwrite)
            asset.tif_path = str(tif_file)

    return chosen
|
||||
Reference in New Issue
Block a user