stage-2-v3-configurable-v2
This commit is contained in:
@@ -282,6 +282,8 @@ def infer_style_and_name(it: Landmark, locale: str) -> dict[str, Any]:
|
|||||||
style["label"] = "Селища"
|
style["label"] = "Селища"
|
||||||
else:
|
else:
|
||||||
style["label"] = humanize_key(it.point_group_key or it.raw_type_key())
|
style["label"] = humanize_key(it.point_group_key or it.raw_type_key())
|
||||||
|
style["icon"] = "marker"
|
||||||
|
style["color"] = "#FB8C00"
|
||||||
return {
|
return {
|
||||||
"semantic_class": sem_class,
|
"semantic_class": sem_class,
|
||||||
"base_name": base,
|
"base_name": base,
|
||||||
@@ -358,6 +360,18 @@ def analyze_to_config(items: list[Landmark], example_count: int, locale_hint: Op
|
|||||||
for k, v in m.semantic_tags.items():
|
for k, v in m.semantic_tags.items():
|
||||||
if k != "name" and v:
|
if k != "name" and v:
|
||||||
sem_pairs[(k, v)] += 1
|
sem_pairs[(k, v)] += 1
|
||||||
|
|
||||||
|
member_count = max(1, len(members))
|
||||||
|
semantic_required_any = []
|
||||||
|
semantic_preferred_any = []
|
||||||
|
for (k, v), c in sem_pairs.most_common(5):
|
||||||
|
pair = {k: v}
|
||||||
|
coverage = c / member_count
|
||||||
|
if coverage >= 0.80:
|
||||||
|
semantic_required_any.append(pair)
|
||||||
|
else:
|
||||||
|
semantic_preferred_any.append(pair)
|
||||||
|
|
||||||
group_id = re.sub(r"[^a-zA-Z0-9_]+", "_", key).strip("_").lower() or "group"
|
group_id = re.sub(r"[^a-zA-Z0-9_]+", "_", key).strip("_").lower() or "group"
|
||||||
groups_cfg.append({
|
groups_cfg.append({
|
||||||
"id": group_id,
|
"id": group_id,
|
||||||
@@ -366,7 +380,8 @@ def analyze_to_config(items: list[Landmark], example_count: int, locale_hint: Op
|
|||||||
"match": {
|
"match": {
|
||||||
"point_group_keys": [key],
|
"point_group_keys": [key],
|
||||||
"raw_types": [rt for rt, _ in raw_types.most_common(5)],
|
"raw_types": [rt for rt, _ in raw_types.most_common(5)],
|
||||||
"semantic_required_any": [{k:v} for (k,v), _ in sem_pairs.most_common(5)],
|
"semantic_required_any": semantic_required_any,
|
||||||
|
"semantic_preferred_any": semantic_preferred_any,
|
||||||
"gpxsee_classes_any": sorted({c for m in members for c in m.gpxsee_classes})[:8],
|
"gpxsee_classes_any": sorted({c for m in members for c in m.gpxsee_classes})[:8],
|
||||||
"has_name": None,
|
"has_name": None,
|
||||||
"has_image": None,
|
"has_image": None,
|
||||||
@@ -386,6 +401,7 @@ def analyze_to_config(items: list[Landmark], example_count: int, locale_hint: Op
|
|||||||
"image_count": sum(1 for m in members if m.garmin_image_id),
|
"image_count": sum(1 for m in members if m.garmin_image_id),
|
||||||
"raw_type_counts": dict(raw_types.most_common(10)),
|
"raw_type_counts": dict(raw_types.most_common(10)),
|
||||||
"semantic_tag_counts": {f"{k}={v}": c for (k,v), c in sem_pairs.most_common(10)},
|
"semantic_tag_counts": {f"{k}={v}": c for (k,v), c in sem_pairs.most_common(10)},
|
||||||
|
"semantic_tag_coverages": {f"{k}={v}": round(c / member_count, 6) for (k,v), c in sem_pairs.most_common(10)},
|
||||||
},
|
},
|
||||||
"examples": spread_examples(members, example_count),
|
"examples": spread_examples(members, example_count),
|
||||||
})
|
})
|
||||||
@@ -426,6 +442,31 @@ def load_json(path: Path) -> Any:
|
|||||||
return json.load(f)
|
return json.load(f)
|
||||||
return json.loads(path.read_text(encoding="utf-8"))
|
return json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
|
||||||
|
def migrate_config(config: dict[str, Any], *, min_coverage: float = 0.80) -> dict[str, Any]:
    """Return a normalized copy of an autoconfig with sparse required tags demoted.

    Older autoconfigs accidentally promoted sparse semantic tags into required
    match constraints for raw symbol groups. For every group that matches on
    ``raw_types`` or ``point_group_keys``, each single-pair entry of
    ``match.semantic_required_any`` whose coverage recorded under
    ``stats.semantic_tag_coverages`` is below ``min_coverage`` is moved to
    ``match.semantic_preferred_any``.

    Args:
        config: Parsed configuration. It is not modified.
        min_coverage: Coverage fraction below which a required pair is
            demoted to preferred.

    Returns:
        A JSON round-tripped copy of ``config`` with the adjusted match lists.
    """
    # A JSON round trip yields an independent deep copy, so the caller's
    # dictionary is never mutated.
    cfg = json.loads(json.dumps(config, ensure_ascii=False))
    for group in cfg.get("groups") or []:
        match = group.get("match")
        if match is None:
            # Treat an explicit null exactly like a missing "match" section.
            match = group["match"] = {}
        required = list(match.get("semantic_required_any") or [])
        if not required or not (match.get("raw_types") or match.get("point_group_keys")):
            continue
        coverages = dict((group.get("stats") or {}).get("semantic_tag_coverages") or {})
        preferred = list(match.get("semantic_preferred_any") or [])
        kept = []
        for pair in required:
            coverage = None
            if isinstance(pair, dict) and len(pair) == 1:
                (key, value), = pair.items()
                coverage = coverages.get(f"{key}={value}")
            # Pairs with no recorded coverage (or an unexpected shape) stay
            # required: there is no evidence here that they are sparse.
            # NOTE(review): configs written before coverages were recorded
            # are therefore left unchanged -- confirm that is intended.
            if coverage is not None and float(coverage) < min_coverage:
                if pair not in preferred:
                    preferred.append(pair)
            else:
                kept.append(pair)
        match["semantic_required_any"] = kept
        match["semantic_preferred_any"] = preferred
    return cfg
|
||||||
|
|
||||||
def has_cyrillic(text: str) -> bool:
    """Return True if *text* contains at least one Cyrillic code point.

    Besides the basic Cyrillic block, the Cyrillic Supplement and the
    Cyrillic Extended-A/B/C/D blocks are recognised, so letters outside
    U+0400-U+04FF are not misreported as non-Cyrillic.
    """
    cyrillic_ranges = (
        (0x0400, 0x052F),    # Cyrillic + Cyrillic Supplement (contiguous)
        (0x1C80, 0x1C8F),    # Cyrillic Extended-C
        (0x2DE0, 0x2DFF),    # Cyrillic Extended-A
        (0xA640, 0xA69F),    # Cyrillic Extended-B
        (0x1E030, 0x1E08F),  # Cyrillic Extended-D
    )
    return any(
        low <= code_point <= high
        for code_point in map(ord, text)
        for low, high in cyrillic_ranges
    )
|
||||||
|
|
||||||
@@ -508,11 +549,13 @@ def render_name(it: Landmark, group: dict[str, Any], config: dict[str, Any]) ->
|
|||||||
return sanitize_text(template.format(base_name=base, hash=short_hash, existing_name=existing))
|
return sanitize_text(template.format(base_name=base, hash=short_hash, existing_name=existing))
|
||||||
return existing or sanitize_text(template.format(base_name=base, hash=short_hash, existing_name=existing))
|
return existing or sanitize_text(template.format(base_name=base, hash=short_hash, existing_name=existing))
|
||||||
|
|
||||||
def build_records(items: list[Landmark], config: dict[str, Any]) -> list[dict[str, Any]]:
|
def build_records(items: list[Landmark], config: dict[str, Any]) -> tuple[list[dict[str, Any]], list[Landmark]]:
|
||||||
out = []
|
out = []
|
||||||
|
unmatched: list[Landmark] = []
|
||||||
for it in items:
|
for it in items:
|
||||||
group = assign_group(it, config)
|
group = assign_group(it, config)
|
||||||
if group is None:
|
if group is None:
|
||||||
|
unmatched.append(it)
|
||||||
continue
|
continue
|
||||||
display = group.get("display") or {}
|
display = group.get("display") or {}
|
||||||
final_name = render_name(it, group, config)
|
final_name = render_name(it, group, config)
|
||||||
@@ -548,7 +591,29 @@ def build_records(items: list[Landmark], config: dict[str, Any]) -> list[dict[st
|
|||||||
"source_files": list(it.source_files),
|
"source_files": list(it.source_files),
|
||||||
"duplicate_count": it.duplicate_count,
|
"duplicate_count": it.duplicate_count,
|
||||||
})
|
})
|
||||||
return out
|
return out, unmatched
|
||||||
|
|
||||||
|
def write_unmatched_csv(items: list[Landmark], path: Path):
    """Write one CSV row per landmark in *items* to *path*, after a header row.

    Coordinates are formatted with eight decimal places, the name is passed
    through ``sanitize_text``, and list/dict attributes are stored as JSON
    strings in the ``*_json`` columns.
    """
    fields = [
        "mapset",
        "lon",
        "lat",
        "name",
        "garmin_kind",
        "garmin_type",
        "garmin_subtype",
        "point_group_key",
        "garmin_image_id",
        "gpxsee_classes_json",
        "semantic_tags_json",
        "point_interest_reasons_json",
        "source_files_json",
    ]

    def as_json(value, **options):
        # Keep non-ASCII text readable in the output instead of \u escapes.
        return json.dumps(value, ensure_ascii=False, **options)

    # NOTE(review): open_text is defined elsewhere in this file; presumably it
    # opens a text-mode handle suitable for csv -- confirm newline handling.
    with open_text(path, "w") as handle:
        writer = csv.DictWriter(handle, fieldnames=fields)
        writer.writeheader()
        for landmark in items:
            row = {
                "mapset": landmark.mapset,
                "lon": f"{landmark.lon:.8f}",
                "lat": f"{landmark.lat:.8f}",
                "name": sanitize_text(landmark.name),
                "garmin_kind": landmark.garmin_kind,
                "garmin_type": landmark.garmin_type,
                "garmin_subtype": landmark.garmin_subtype,
                "point_group_key": landmark.point_group_key,
                "garmin_image_id": landmark.garmin_image_id,
                "gpxsee_classes_json": as_json(landmark.gpxsee_classes),
                "semantic_tags_json": as_json(landmark.semantic_tags, sort_keys=True),
                "point_interest_reasons_json": as_json(landmark.point_interest_reasons),
                "source_files_json": as_json(landmark.source_files),
            }
            writer.writerow(row)
|
||||||
|
|
||||||
def write_resolved_csv(records: list[dict[str, Any]], path: Path):
|
def write_resolved_csv(records: list[dict[str, Any]], path: Path):
|
||||||
fields = ["group_id","group_label","lon","lat","name","short_hash","icon","color","background","duplicate_count","source_files_json","props_json"]
|
fields = ["group_id","group_label","lon","lat","name","short_hash","icon","color","background","duplicate_count","source_files_json","props_json"]
|
||||||
@@ -651,6 +716,7 @@ def main(argv=None):
|
|||||||
b.add_argument("--osm", type=Path)
|
b.add_argument("--osm", type=Path)
|
||||||
b.add_argument("--gpx", type=Path)
|
b.add_argument("--gpx", type=Path)
|
||||||
b.add_argument("--summary-json", type=Path)
|
b.add_argument("--summary-json", type=Path)
|
||||||
|
b.add_argument("--unmatched-csv", type=Path, help="Write unmatched input landmarks for debugging config misses")
|
||||||
b.add_argument("--dedupe-mode", choices=["coord","coord_name","coord_type","coord_name_type","off"])
|
b.add_argument("--dedupe-mode", choices=["coord","coord_name","coord_type","coord_name_type","off"])
|
||||||
b.add_argument("--dedupe-radius-m", type=float)
|
b.add_argument("--dedupe-radius-m", type=float)
|
||||||
args = ap.parse_args(argv)
|
args = ap.parse_args(argv)
|
||||||
@@ -673,15 +739,16 @@ def main(argv=None):
|
|||||||
print(f"[info] wrote summary: {args.summary_json}")
|
print(f"[info] wrote summary: {args.summary_json}")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
cfg = load_json(args.config)
|
cfg = migrate_config(load_json(args.config))
|
||||||
defaults = cfg.get("defaults") or {}
|
defaults = cfg.get("defaults") or {}
|
||||||
dedupe_cfg = defaults.get("dedupe") or {}
|
dedupe_cfg = defaults.get("dedupe") or {}
|
||||||
mode = args.dedupe_mode or dedupe_cfg.get("mode", "coord")
|
mode = args.dedupe_mode or dedupe_cfg.get("mode", "coord")
|
||||||
radius = args.dedupe_radius_m if args.dedupe_radius_m is not None else float(dedupe_cfg.get("radius_m", 12.0))
|
radius = args.dedupe_radius_m if args.dedupe_radius_m is not None else float(dedupe_cfg.get("radius_m", 12.0))
|
||||||
items = dedupe(items, radius_m=radius, mode=mode)
|
items = dedupe(items, radius_m=radius, mode=mode)
|
||||||
records = build_records(items, cfg)
|
records, unmatched = build_records(items, cfg)
|
||||||
if not any([args.resolved_csv, args.osm, args.gpx, args.summary_json]):
|
if not any([args.resolved_csv, args.osm, args.gpx, args.summary_json, args.unmatched_csv]):
|
||||||
print(f"[info] built {len(records)} records")
|
print(f"[info] built {len(records)} records")
|
||||||
|
print(f"[info] unmatched {len(unmatched)} landmarks")
|
||||||
return 0
|
return 0
|
||||||
if args.resolved_csv:
|
if args.resolved_csv:
|
||||||
write_resolved_csv(records, args.resolved_csv)
|
write_resolved_csv(records, args.resolved_csv)
|
||||||
@@ -692,10 +759,14 @@ def main(argv=None):
|
|||||||
if args.gpx:
|
if args.gpx:
|
||||||
write_gpx(records, args.gpx)
|
write_gpx(records, args.gpx)
|
||||||
print(f"[info] wrote GPX: {args.gpx}")
|
print(f"[info] wrote GPX: {args.gpx}")
|
||||||
|
if args.unmatched_csv:
|
||||||
|
write_unmatched_csv(unmatched, args.unmatched_csv)
|
||||||
|
print(f"[info] wrote unmatched CSV: {args.unmatched_csv}")
|
||||||
if args.summary_json:
|
if args.summary_json:
|
||||||
c = Counter(r["group_id"] for r in records)
|
c = Counter(r["group_id"] for r in records)
|
||||||
save_json(args.summary_json, {
|
save_json(args.summary_json, {
|
||||||
"record_count": len(records),
|
"record_count": len(records),
|
||||||
|
"unmatched_count": len(unmatched),
|
||||||
"groups": dict(c.most_common()),
|
"groups": dict(c.most_common()),
|
||||||
"dedupe_mode": mode,
|
"dedupe_mode": mode,
|
||||||
"dedupe_radius_m": radius,
|
"dedupe_radius_m": radius,
|
||||||
|
|||||||
Reference in New Issue
Block a user