#!/usr/bin/env python3 """One-time script to download badge images from Bulbapedia.""" import json import re import subprocess import sys from pathlib import Path BADGES_DIR = Path(__file__).resolve().parent.parent / "frontend" / "public" / "badges" SEEDS_DIR = ( Path(__file__).resolve().parent.parent / "backend" / "src" / "app" / "seeds" / "data" ) MEDIAWIKI_API = "https://archives.bulbagarden.net/w/api.php" def get_referenced_badges() -> set[str]: """Extract all unique non-null badge_image_url from seed files.""" badges = set() for f in SEEDS_DIR.glob("*-bosses.json"): data = json.loads(f.read_text()) for boss in data: url = boss.get("badge_image_url") if url: badges.add(url) return badges def get_missing_badges() -> list[str]: """Return badge paths that are referenced but don't exist on disk.""" referenced = get_referenced_badges() missing = [] for badge_path in sorted(referenced): full_path = BADGES_DIR / Path(badge_path).name if not full_path.exists(): missing.append(badge_path) return missing def badge_path_to_bulbapedia_filename(badge_path: str) -> str: """Convert /badges/coal-badge.png -> Coal_Badge.png""" name = Path(badge_path).stem # e.g. "coal-badge" parts = name.split("-") # ["coal", "badge"] title_parts = [p.capitalize() for p in parts] return "_".join(title_parts) + ".png" def resolve_image_urls(filenames: list[str]) -> dict[str, str | None]: """Use MediaWiki API to resolve image filenames to direct URLs.""" results = {} # Process in batches of 50 for i in range(0, len(filenames), 50): batch = filenames[i : i + 50] titles = "|".join(f"File:{fn}" for fn in batch) cmd = [ "curl", "-s", f"{MEDIAWIKI_API}?action=query&titles={titles}" "&prop=imageinfo&iiprop=url&format=json", ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) data = json.loads(result.stdout) # Build normalization map (API normalizes underscores to spaces) norm_map = {} for entry in data.get("query", {}).get("normalized", []): norm_map[entry["to"]] = entry["from"] pages = data.get("query", {}).get("pages", {}) for page in pages.values(): title = page.get("title", "").replace("File:", "") # Map back to original underscore form original = norm_map.get(f"File:{title}", f"File:{title}").replace( "File:", "" ) imageinfo = page.get("imageinfo", []) if imageinfo: results[original] = imageinfo[0]["url"] else: results[original] = None return results def download_file(url: str, dest: Path) -> bool: """Download a file using curl.""" dest.parent.mkdir(parents=True, exist_ok=True) result = subprocess.run( ["curl", "-sL", "-o", str(dest), url], capture_output=True, text=True, ) return result.returncode == 0 and dest.exists() and dest.stat().st_size > 0 def main(): missing = get_missing_badges() if not missing: print("All badge images already exist!") return print(f"Missing {len(missing)} badge images:") for b in missing: print(f" {b}") # Build mapping: badge_path -> bulbapedia_filename path_to_filename = {} for badge_path in missing: path_to_filename[badge_path] = badge_path_to_bulbapedia_filename(badge_path) print(f"\nResolving {len(path_to_filename)} image URLs from Bulbapedia...") filenames = list(set(path_to_filename.values())) url_map = resolve_image_urls(filenames) # Download success = 0 failed = [] for badge_path, bp_filename in sorted(path_to_filename.items()): url = url_map.get(bp_filename) if not url: print(f" FAILED: {badge_path} (no URL for {bp_filename})") failed.append((badge_path, bp_filename)) continue dest = BADGES_DIR / Path(badge_path).name if download_file(url, dest): print(f" OK: {badge_path}") success += 1 else: print(f" FAILED: {badge_path} (download error)") failed.append((badge_path, bp_filename)) print(f"\nDownloaded: {success}/{len(missing)}") if failed: print(f"Failed ({len(failed)}):") for badge_path, bp_filename in failed: print(f" {badge_path} -> {bp_filename}") if __name__ == "__main__": main()