This commit is contained in:
nq
2026-05-03 21:58:47 +03:00
parent fd44235ff4
commit e45d1cb6b4
18 changed files with 1259 additions and 0 deletions

115
bgtopo_poc/inventory.py Normal file
View File

@@ -0,0 +1,115 @@
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Iterable, List, Optional
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from .utils import ensure_dir
LOG = logging.getLogger(__name__)
@dataclass
class SheetAsset:
    """Record describing one map sheet: its remote files and local copies."""

    # Identifier taken from the file name with the .map/.tif/.tiff suffix removed.
    sheet_id: str
    # Remote locations of the calibration (.map) and raster (.tif/.tiff) files.
    map_url: Optional[str]
    tif_url: Optional[str]
    # Local file paths; they stay None until download_assets() fills them in.
    map_path: Optional[str] = None
    tif_path: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the record's fields as a plain dict keyed by field name."""
        field_values = asdict(self)
        return field_values
def discover_original_assets(base_url: str, include_100k: bool = False) -> List[SheetAsset]:
    """Discover .map/.tif pairs from the BGtopoVJ original raster directory listing.

    The page at ``base_url`` is fetched and every ``<a href>`` is inspected.
    Links ending in ``.map``, ``.tif`` or ``.tiff`` (case-insensitive) are
    grouped by file name without the extension; that name becomes the
    ``sheet_id``.

    Args:
        base_url: URL of the HTML directory listing. Relative links are
            resolved against it with ``urljoin``, so it should end with ``/``;
            otherwise the last path segment is replaced instead of extended.
        include_100k: When False (default), links whose href contains
            ``"100k"`` (case-insensitive) are skipped.

    Returns:
        Assets that have both a ``.map`` and a raster URL, sorted by
        ``sheet_id``. Sheets with only one of the two files are dropped.

    Raises:
        requests.HTTPError: If the listing request returns a 4xx/5xx status.
    """
    LOG.info("Discovering assets from %s", base_url)
    response = requests.get(base_url, timeout=60)
    # Without this check an error page would be parsed like a listing and the
    # function would quietly report zero assets.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]

    by_sheet: dict[str, SheetAsset] = {}
    for href in hrefs:
        # Skip sort/query links, absolute-path links and the parent-directory entry.
        if href.startswith(("?", "/")) or href == "../":
            continue
        lowered = href.lower()
        if not include_100k and "100k" in lowered:
            continue
        if not lowered.endswith((".map", ".tif", ".tiff")):
            continue

        sheet_id = re.sub(r"\.(map|tif|tiff)$", "", Path(href).name, flags=re.IGNORECASE)
        item = by_sheet.get(sheet_id)
        if item is None:
            item = SheetAsset(sheet_id=sheet_id, map_url=None, tif_url=None)
            by_sheet[sheet_id] = item

        full_url = urljoin(base_url, href)
        if lowered.endswith(".map"):
            item.map_url = full_url
        else:
            item.tif_url = full_url

    # Only sheets with both halves of the pair are usable downstream.
    assets = [v for v in by_sheet.values() if v.map_url and v.tif_url]
    assets.sort(key=lambda x: x.sheet_id)
    LOG.info("Discovered %d complete .map/.tif pairs", len(assets))
    return assets
def write_manifest_csv(assets: Iterable[SheetAsset], out_csv: str | Path) -> Path:
    """Write the given assets to a CSV manifest, one row per asset.

    Args:
        assets: Assets to serialise; each is converted with ``to_dict()``.
        out_csv: Destination file. Its parent directory is passed to
            ``ensure_dir`` before writing.

    Returns:
        The destination as a ``Path``.
    """
    # Function-scope import keeps this block self-contained; the module header
    # only imports ``dataclass`` and ``asdict``.
    from dataclasses import fields

    out_csv = Path(out_csv)
    rows = [a.to_dict() for a in assets]
    # Pin the column set so an empty asset list still yields a manifest with a
    # proper header row instead of a header-less file.
    columns = [f.name for f in fields(SheetAsset)]
    ensure_dir(out_csv.parent)
    pd.DataFrame(rows, columns=columns).to_csv(out_csv, index=False)
    LOG.info("Wrote manifest: %s", out_csv)
    return out_csv
def read_manifest_csv(path: str | Path) -> List[SheetAsset]:
    """Load a CSV manifest back into a list of ``SheetAsset`` records.

    Args:
        path: CSV file with a ``sheet_id`` column and, optionally,
            ``map_url``, ``tif_url``, ``map_path`` and ``tif_path`` columns.

    Returns:
        One ``SheetAsset`` per row, in file order. Missing or empty optional
        cells become ``None``.

    Raises:
        KeyError: If the file has rows but no ``sheet_id`` column.
    """
    # Read every cell as text. Type inference would turn numeric-looking sheet
    # ids into numbers (dropping leading zeros) and would convert literal
    # strings such as "NA" into NaN.
    df = pd.read_csv(path, dtype=str, keep_default_na=False).fillna("")

    def _optional(record: dict, key: str) -> Optional[str]:
        # An absent column and an empty cell both map to None.
        return str(record.get(key) or "") or None

    return [
        SheetAsset(
            sheet_id=str(record["sheet_id"]),
            map_url=_optional(record, "map_url"),
            tif_url=_optional(record, "tif_url"),
            map_path=_optional(record, "map_path"),
            tif_path=_optional(record, "tif_path"),
        )
        for record in df.to_dict("records")
    ]
def _download_one(url: str, out_path: Path, overwrite: bool = False) -> Path:
    """Stream ``url`` to ``out_path`` and return ``out_path``.

    An existing non-empty file is reused unless ``overwrite`` is true. The
    body is first written to a ``<name>.part`` sibling and renamed into place
    only after the transfer finishes, so an interrupted download never leaves
    a truncated file that a later run would take for a finished one.

    Args:
        url: Remote file to fetch.
        out_path: Final location of the downloaded file.
        overwrite: Re-download even if ``out_path`` already has content.

    Returns:
        ``out_path``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    if not overwrite and out_path.exists() and out_path.stat().st_size > 0:
        return out_path

    ensure_dir(out_path.parent)
    part_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            # A missing or empty Content-Length header falls back to 0.
            total = int(r.headers.get("content-length", "0") or 0)
            with open(part_path, "wb") as f, tqdm(
                total=total, unit="B", unit_scale=True, desc=out_path.name
            ) as pbar:
                for chunk in r.iter_content(chunk_size=1024 * 512):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
        # Same-directory rename: the final name appears only once complete.
        part_path.replace(out_path)
    except BaseException:
        # Also covers Ctrl-C; never leave a partial file behind.
        if part_path.exists():
            part_path.unlink()
        raise
    return out_path
def download_assets(
    assets: List[SheetAsset],
    out_dir: str | Path,
    limit: Optional[int] = None,
    overwrite: bool = False,
) -> List[SheetAsset]:
    """Fetch the .map and .tif files of each asset into per-sheet folders.

    Files are stored as ``<out_dir>/<sheet_id>/<sheet_id>.map`` and
    ``<out_dir>/<sheet_id>/<sheet_id>.tif``. The resulting paths are written
    back onto the asset objects (``map_path`` / ``tif_path``) in place.

    Args:
        assets: Assets to download.
        out_dir: Root directory for the per-sheet folders.
        limit: Process only the first ``limit`` assets. Any falsy value
            (``None`` or ``0``) means no limit.
        overwrite: Forwarded to ``_download_one``.

    Returns:
        The processed assets: ``assets`` itself when no limit applies,
        otherwise the sliced prefix.
    """
    root = Path(out_dir)
    batch = assets if not limit else assets[:limit]
    for asset in batch:
        stem = asset.sheet_id
        target_dir = root / stem
        if asset.map_url:
            map_file = _download_one(asset.map_url, target_dir / f"{stem}.map", overwrite=overwrite)
            asset.map_path = str(map_file)
        if asset.tif_url:
            tif_file = _download_one(asset.tif_url, target_dir / f"{stem}.tif", overwrite=overwrite)
            asset.tif_path = str(tif_file)
    return batch