Files
nuzlocke-tracker/scripts/fetch_badges.py

147 lines
4.6 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""One-time script to download badge images from Bulbapedia."""
import json
import re
import subprocess
import sys
from pathlib import Path
from urllib.parse import urlencode
# Destination directory for downloaded badge images (served as frontend static assets).
BADGES_DIR = Path(__file__).resolve().parent.parent / "frontend" / "public" / "badges"
# Directory containing the "*-bosses.json" seed files that reference badge images.
SEEDS_DIR = (
    Path(__file__).resolve().parent.parent
    / "backend"
    / "src"
    / "app"
    / "seeds"
    / "data"
)
# Bulbagarden archives MediaWiki API endpoint, used to resolve File: pages to image URLs.
MEDIAWIKI_API = "https://archives.bulbagarden.net/w/api.php"
def get_referenced_badges() -> set[str]:
"""Extract all unique non-null badge_image_url from seed files."""
badges = set()
for f in SEEDS_DIR.glob("*-bosses.json"):
data = json.loads(f.read_text())
for boss in data:
url = boss.get("badge_image_url")
if url:
badges.add(url)
return badges
def get_missing_badges() -> list[str]:
    """Return badge paths that are referenced but don't exist on disk.

    A badge counts as present when a file with the same basename exists
    under BADGES_DIR.
    """
    return [
        badge_path
        for badge_path in sorted(get_referenced_badges())
        if not (BADGES_DIR / Path(badge_path).name).exists()
    ]
def badge_path_to_bulbapedia_filename(badge_path: str) -> str:
    """Convert /badges/coal-badge.png -> Coal_Badge.png"""
    # Stem drops the directory and extension, leaving e.g. "coal-badge";
    # each hyphen-separated word is capitalized and rejoined with "_".
    stem = Path(badge_path).stem
    return "_".join(word.capitalize() for word in stem.split("-")) + ".png"
def resolve_image_urls(filenames: list[str]) -> dict[str, str | None]:
    """Use MediaWiki API to resolve image filenames to direct URLs.

    Args:
        filenames: Bulbapedia file names in underscore form,
            e.g. ``"Coal_Badge.png"``.

    Returns:
        Mapping from each input filename to its direct image URL, or
        ``None`` when the wiki has no imageinfo for that file.

    Raises:
        subprocess.CalledProcessError: if curl exits non-zero.
        json.JSONDecodeError: if the API response is not valid JSON.
    """
    results: dict[str, str | None] = {}
    # The MediaWiki API accepts at most 50 titles per query.
    for i in range(0, len(filenames), 50):
        batch = filenames[i : i + 50]
        # Build the query string with urlencode so the "|" title separators
        # and ":" in "File:" are properly percent-escaped; the previous raw
        # string concatenation sent reserved characters unescaped.
        query = urlencode(
            {
                "action": "query",
                "titles": "|".join(f"File:{fn}" for fn in batch),
                "prop": "imageinfo",
                "iiprop": "url",
                "format": "json",
            }
        )
        result = subprocess.run(
            ["curl", "-s", f"{MEDIAWIKI_API}?{query}"],
            capture_output=True,
            text=True,
            check=True,
        )
        data = json.loads(result.stdout)
        # The API normalizes underscores to spaces in titles; build a reverse
        # map so results are keyed by the caller's original underscore form.
        norm_map = {}
        for entry in data.get("query", {}).get("normalized", []):
            norm_map[entry["to"]] = entry["from"]
        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            title = page.get("title", "").replace("File:", "")
            # Map back to the original underscore form when it was normalized.
            original = norm_map.get(f"File:{title}", f"File:{title}").replace(
                "File:", ""
            )
            imageinfo = page.get("imageinfo", [])
            results[original] = imageinfo[0]["url"] if imageinfo else None
    return results
def download_file(url: str, dest: Path) -> bool:
    """Download a file using curl.

    Returns True only when curl exits successfully AND a non-empty file
    was written to *dest*.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    proc = subprocess.run(
        ["curl", "-sL", "-o", str(dest), url],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        return False
    # curl can exit 0 yet write nothing useful; require a non-empty file.
    return dest.exists() and dest.stat().st_size > 0
def main():
    """Find missing badge images, resolve their Bulbapedia URLs, download them."""
    missing = get_missing_badges()
    if not missing:
        print("All badge images already exist!")
        return

    print(f"Missing {len(missing)} badge images:")
    for badge in missing:
        print(f" {badge}")

    # Map each missing badge path to its expected Bulbapedia file name.
    path_to_filename = {
        badge: badge_path_to_bulbapedia_filename(badge) for badge in missing
    }

    print(f"\nResolving {len(path_to_filename)} image URLs from Bulbapedia...")
    url_map = resolve_image_urls(list(set(path_to_filename.values())))

    # Download each resolved badge, tracking successes and failures.
    success = 0
    failed = []
    for badge_path, bp_filename in sorted(path_to_filename.items()):
        url = url_map.get(bp_filename)
        if not url:
            print(f" FAILED: {badge_path} (no URL for {bp_filename})")
            failed.append((badge_path, bp_filename))
            continue
        dest = BADGES_DIR / Path(badge_path).name
        if download_file(url, dest):
            print(f" OK: {badge_path}")
            success += 1
        else:
            print(f" FAILED: {badge_path} (download error)")
            failed.append((badge_path, bp_filename))

    print(f"\nDownloaded: {success}/{len(missing)}")
    if failed:
        print(f"Failed ({len(failed)}):")
        for badge_path, bp_filename in failed:
            print(f" {badge_path} -> {bp_filename}")
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()