Add core encounter processing pipeline

Filter by game version, parse levels and rate variants across all
generations, aggregate encounters by pokemon+method, and build
parent/child route hierarchy. Also completes encounter method coverage
(73/73) and pokemon form mapping (1180/1181) with manual overrides.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Julian Tabel
2026-02-11 10:12:55 +01:00
parent df7ea64b9e
commit d80c59047c
5 changed files with 557 additions and 13 deletions

View File

@@ -18,6 +18,7 @@ from pathlib import Path
from .loader import load_pokedb_data, load_seed_config
from .mappings import PokemonMapper, LocationMapper, build_version_map, map_encounter_method
from .processing import filter_encounters_for_game, process_encounters, build_routes
SEEDS_DIR_CANDIDATES = [
Path("backend/src/app/seeds"), # from repo root
@@ -141,8 +142,47 @@ def main(argv: list[str] | None = None) -> None:
pokemon_mapper.report_unmapped()
# TODO: Processing pipeline (subtasks rfg0, gkcy)
print("\nMappings built. Processing pipeline not yet implemented.")
# Process encounters per game
print("\nProcessing encounters...")
games_to_process: list[tuple[str, str, int]] = [] # (vg_key, game_slug, generation)
for vg_key, vg_info in config.version_groups.items():
generation = vg_info.get("generation", 0)
for slug in vg_info.get("versions", []):
if target_game and slug != target_game:
continue
games_to_process.append((vg_key, slug, generation))
for vg_key, game_slug, generation in games_to_process:
print(f"\n--- {game_slug} ---")
# Filter encounters for this game
game_encounters = filter_encounters_for_game(pokedb.encounters, game_slug)
if not game_encounters:
print(f" No encounters found in PokeDB data")
continue
print(f" Raw encounters: {len(game_encounters)}")
# Process into grouped encounters
encounters_by_area = process_encounters(
game_encounters, generation, pokemon_mapper, location_mapper,
)
print(f" Location areas with encounters: {len(encounters_by_area)}")
# Build route hierarchy
routes = build_routes(encounters_by_area, location_mapper)
# Stats
total_routes = sum(1 + len(r.children) for r in routes)
total_enc = sum(
len(r.encounters) + sum(len(c.encounters) for c in r.children)
for r in routes
)
print(f" Routes: {total_routes}")
print(f" Encounter entries: {total_enc}")
print("\nProcessing complete. Output not yet written (subtask gkcy).")
if __name__ == "__main__":

View File

@@ -89,15 +89,42 @@ ENCOUNTER_METHOD_MAP: dict[str, str] = {
"ambush": "walk",
# Seaweed / diving
"diving": "surf",
"diving-seaweed": "surf",
"seaweed": "surf",
# Raids
"max-raid": "raid",
"max-raid-battle": "raid",
"dynamax-adventure": "raid",
"tera-raid": "raid",
"tera-raid-battle": "raid",
"fixed-tera-encounter": "static",
# Misc
"roaming": "roaming",
"safari-zone": "walk",
"bug-contest": "walk",
"dust-cloud": "walk",
"hidden-grotto": "static",
"hidden-encounter": "walk",
"horde-encounter": "walk",
"shaking-trees": "walk",
"shaking-ore-deposits": "walk",
"island-scan": "static",
"mass-outbreak": "swarm",
"npc-buy": "gift",
"special-encounter": "static",
"sea-skim": "surf",
"midair": "walk",
"mr-backlot": "walk",
"hoenn-sound": "walk",
"sinnoh-sound": "walk",
"curry": "gift",
"boxes": "gift",
"berry-tree": "walk",
"zygarde-cube-assemble": "static",
"contact-flock": "walk",
"contact-space-time-distortion": "walk",
"contact-unown-reasearch-notes": "static",
"flying-pokemon-shadow": "walk",
}
# Prefix-based fallbacks for methods not explicitly listed above.
@@ -107,6 +134,8 @@ _METHOD_PREFIX_MAP: list[tuple[str, str]] = [
("fishing-", "fishing"),
("headbutt-", "headbutt"),
("flying-", "walk"),
("ambush-", "walk"),
("contact-", "walk"),
]
@@ -180,14 +209,119 @@ def build_version_map(
# Pokemon form mapping
# ---------------------------------------------------------------------------
# PokeDB uses adjectival region forms ("alolan") while PokeAPI/our data uses
# region names ("alola"). This maps PokeDB form suffixes → our form suffixes.
#
# _normalize_slug() matches these keys against the end of an identifier
# (after a "-") and tries longer keys before shorter ones, so a compound key
# such as "alolan-totem" takes precedence over a shorter key that also fits.
#
# Entries that map a suffix to itself (e.g. "hero": "hero") leave the
# identifier unchanged; they only record that the suffix is a known form.
_FORM_SUFFIX_MAP: dict[str, str] = {
    # Regional forms
    "alolan": "alola",
    "galarian": "galar",
    "hisuian": "hisui",
    "paldean": "paldea",
    # Totem forms
    "alolan-totem": "totem-alola",
    # Basculin stripes
    "blue-stripe": "blue-striped",
    "red-stripe": "red-striped",
    "white-stripe": "white-striped",
    # Sea forms
    "west-sea": "west",
    "east-sea": "east",
    # Cloak forms
    "plant-cloak": "plant",
    "sandy-cloak": "sandy",
    "trash-cloak": "trash",
    # Eiscue
    "ice-face": "ice",
    # Misc forms
    "pompom": "pom-pom",
    "10p": "10",
    "50p": "50",
    "owntempo": "own-tempo",
    "two": "two-segment",
    "chest": "chest-form",
    "ice-rider": "ice",
    "shadow-rider": "shadow",
    "apex": "apex-build",
    "ultimate": "ultimate-mode",
    "black-activated": "black",
    "white-activated": "white",
    "hero": "hero",
    "sword": "crowned",
    "shield": "crowned",
    # Gigantamax
    "gigantamax": "gmax",
    # Partner forms
    "partner": "partner-cap",
    # Flabébé / Floette / Florges color forms — the suffix is kept as-is here
    # (identity mapping). When our data has no entry for the colored slug, the
    # PokemonMapper lookup falls back to progressively shorter slugs and so
    # resolves to the base species (e.g. "flabebe-blue" → "flabebe").
    "blue": "blue",
    "orange": "orange",
    "red": "red",
    "white": "white",
    "yellow": "yellow",
    # Gender forms
    "female": "female",
    "male": "male",
    # Furfrou
    "natural": "natural",
    # Cherrim
    "overcast": "overcast",
    # Sinistea / Polteageist
    "antique": "antique",
    "phony": "phony",
    # Poltchageist / Sinistcha
    "artisan": "artisan",
    "counterfeit": "counterfeit",
    "masterpiece": "masterpiece",
    "unremarkable": "unremarkable",
    # Minior cores
    "blue-core": "blue",
    "green-core": "green",
    "indigo-core": "indigo",
    "orange-core": "orange",
    "red-core": "red",
    "violet-core": "violet",
    "yellow-core": "yellow",
    # Vivillon
    "fancy": "fancy",
    # Squawkabilly
    # these use same name, so no suffix entry is needed here; the individual
    # plumage identifiers are resolved through _FORM_OVERRIDES instead.
    # Xerneas
    "neutral": "neutral",
    # Deerling / Sawsbuck
    "spring": "spring",
    "summer": "summer",
    "autumn": "autumn",
    "winter": "winter",
    # Spiky-ears Pichu
    "spiky-ears": "spiky-eared",
    # Paldean breeds
    "paldean-combat-breed": "paldea-combat-breed",
    "paldean-blaze-breed": "paldea-blaze-breed",
    "paldean-aqua-breed": "paldea-aqua-breed",
}
def _normalize_slug(identifier: str) -> str:
    """Translate a PokeDB pokemon_form_identifier into our slug convention.

    Base forms carry a "-default" marker in PokeDB ("pidgey-default"); that
    marker is removed. For any other identifier the trailing form suffix is
    looked up in ``_FORM_SUFFIX_MAP`` and swapped for our spelling, e.g.
    "rattata-alolan" → "rattata-alola". Species names may themselves contain
    dashes ("mr-mime-galarian"), so the split point is found by matching
    known suffixes against the end of the identifier; when several suffixes
    fit, the longest one wins. Identifiers with no known suffix are returned
    unchanged.
    """
    default_marker = "-default"
    if identifier.endswith(default_marker):
        return identifier[: len(identifier) - len(default_marker)]

    # Collect every known suffix that terminates the identifier. Two distinct
    # suffixes of equal length cannot both match, so the longest is unique.
    candidates = [
        suffix
        for suffix in _FORM_SUFFIX_MAP
        if identifier.endswith("-" + suffix)
    ]
    if not candidates:
        return identifier

    longest = max(candidates, key=len)
    # Drop the suffix together with the dash that precedes it.
    species = identifier[: len(identifier) - len(longest) - 1]
    return f"{species}-{_FORM_SUFFIX_MAP[longest]}"
@@ -234,6 +368,21 @@ def _name_to_form_slug(name: str) -> str | None:
return None
# Manual overrides for PokeDB identifiers that can't be resolved generically.
# These are cases where our pokemon.json uses non-standard base form names
# (e.g. "Deoxys Normal" instead of "Deoxys").
#
# Keys are raw PokeDB pokemon_form_identifier values (checked before any slug
# normalization); values are (pokeapi_id, display_name) pairs returned as-is
# by the PokemonMapper lookup.
_FORM_OVERRIDES: dict[str, tuple[int, str]] = {
    "deoxys-default": (386, "Deoxys Normal"),
    "darmanitan-galarian": (10177, "Darmanitan (Galar Standard)"),
    "mimikyu-totem": (10144, "Mimikyu (Totem Disguised)"),
    # NOTE(review): this display name has no parentheses, unlike the other
    # three plumages below — presumably it mirrors the base entry in
    # pokemon.json; confirm against that file.
    "squawkabilly-green": (931, "Squawkabilly Green Plumage"),
    "squawkabilly-blue": (10260, "Squawkabilly (Blue Plumage)"),
    "squawkabilly-white": (10262, "Squawkabilly (White Plumage)"),
    "squawkabilly-yellow": (10261, "Squawkabilly (Yellow Plumage)"),
    "toxtricity-gigantamax": (849, "Toxtricity Amped"),
}
class PokemonMapper:
"""Maps PokeDB pokemon_form_identifier → (pokeapi_id, display_name)."""
@@ -277,6 +426,10 @@ class PokemonMapper:
if not pokemon_form_identifier:
return None
# Check manual overrides first
if pokemon_form_identifier in _FORM_OVERRIDES:
return _FORM_OVERRIDES[pokemon_form_identifier]
slug = _normalize_slug(pokemon_form_identifier)
# Direct slug match
@@ -292,6 +445,19 @@ class PokemonMapper:
self._slug_to_info[slug] = (pokemon_id, name)
return (pokemon_id, name)
# Fallback: strip form suffix to find base species.
# Many cosmetic forms (colors, genders, seasons) don't have separate
# entries in our pokemon.json — they use the base species entry.
# Try progressively shorter slugs: "flabebe-blue" → "flabebe"
parts = slug.split("-")
for i in range(len(parts) - 1, 0, -1):
base = "-".join(parts[:i])
if base in self._slug_to_info:
result = self._slug_to_info[base]
# Cache for future lookups
self._slug_to_info[slug] = result
return result
# Track unmapped
if pokemon_form_identifier not in self._unmapped:
self._unmapped.add(pokemon_form_identifier)

View File

@@ -0,0 +1,338 @@
"""Core encounter processing: filter, parse, aggregate, and group encounters."""
from __future__ import annotations
import re
from typing import Any
from .mappings import LocationMapper, PokemonMapper, map_encounter_method
from .models import Encounter, Route
# ---------------------------------------------------------------------------
# Rate parsing
# ---------------------------------------------------------------------------
# Word-based rates → numeric value.
# Count words ("two", "eight", ...) are 100/n rounded half-up (100/8 → 13).
_WORD_RATES: dict[str, int] = {
    "one": 100,
    "two": 50,
    "three": 33,
    "four": 25,
    "five": 20,
    "six": 17,
    "seven": 14,
    "eight": 13,
    "choose one": 100,
    "one of three": 33,
    "only one": 100,
    "unlimited": 100,
    "respawns": 100,
    "common": 60,
    "average": 30,
    "rare": 10,
    "varies": 50,
}

# Optional leading "~", a number with optional decimal part, optional "%".
# Used with .match(), so anything after the number is ignored.
_PERCENT_RE = re.compile(r"~?(\d+(?:\.\d+)?)%?")


def parse_rate(value: str | None) -> int | None:
    """Parse a rate string into a whole number.

    Handles formats: "50%", "~10%", "10.14%", "one", "common", bare "100".
    Word rates are matched case-insensitively against ``_WORD_RATES``.

    Numeric values are rounded half-up (so "12.5%" → 13, consistent with the
    word table) and floored at 1 so that a tiny but non-zero rate does not
    collapse to 0. The result is NOT capped at 100: bare numbers such as
    spawn weights ("300") are returned as-is.

    Args:
        value: Raw rate from a PokeDB record. Non-string truthy values
            (e.g. a JSON number) are converted with ``str()`` first.

    Returns:
        The parsed rate, or None when the value is empty or unparseable
        (including unknown markers such as "??%").
    """
    if not value:
        return None

    # str() is a no-op for strings and keeps numeric JSON values from
    # crashing on .strip().
    text = str(value).strip()

    # Word-based
    word_rate = _WORD_RATES.get(text.lower())
    if word_rate is not None:
        return word_rate

    # Numeric percentage: "50%", "~10%", "10.14%", or bare "100".
    # "??%" and other non-numeric markers simply fail to match.
    m = _PERCENT_RE.match(text)
    if m is None:
        return None

    # int(x + 0.5) rounds half-up for the non-negative numbers the regex
    # admits; built-in round() would round halves to the nearest even value.
    return max(1, int(float(m.group(1)) + 0.5))
def extract_encounter_rate(record: dict[str, Any], generation: int) -> int:
    """Collapse the rate fields of one PokeDB encounter record into one value.

    The record is probed for each known rate layout in a fixed order and the
    first layout that yields a parseable value wins. Within a layout that has
    several variants (time of day, season, weather, ...) the highest variant
    is used.

    Args:
        record: Raw PokeDB encounter record.
        generation: Accepted for interface compatibility; the layout is
            detected from the fields present, so this value is not consulted.

    Returns:
        The extracted rate. ``probability_*`` values are returned as the raw
        parsed number (they may exceed 100). Records with no usable rate
        information yield 100.
    """

    def highest(*fields: str) -> int | None:
        """Return the largest parseable rate among ``fields``, if any."""
        parsed = (parse_rate(record.get(name)) for name in fields)
        usable = [rate for rate in parsed if rate is not None]
        return max(usable) if usable else None

    # Single overall rate (Gen 1/3/6 style records).
    overall = parse_rate(record.get("rate_overall"))
    if overall is not None:
        return overall

    # Time-of-day rates (Gen 2/4), then seasonal rates (Gen 5).
    variant_groups = (
        ("rate_morning", "rate_day", "rate_night"),
        ("rate_spring", "rate_summer", "rate_autumn", "rate_winter"),
    )
    for group in variant_groups:
        best = highest(*group)
        if best is not None:
            return best

    # Weather rates (Gen 8 Sw/Sh): any non-empty "weather_*_rate" field.
    weather_parsed = (
        parse_rate(raw)
        for field, raw in record.items()
        if field.startswith("weather_") and field.endswith("_rate") and raw
    )
    weather_usable = [rate for rate in weather_parsed if rate is not None]
    if weather_usable:
        return max(weather_usable)

    # Boolean presence flags (Legends Arceus): present under some condition.
    presence_flags = (
        "during_any_time",
        "during_morning",
        "during_day",
        "during_evening",
        "during_night",
    )
    if any(record.get(flag) for flag in presence_flags):
        return 100

    # Spawn weights (Gen 9 Sc/Vi). These are weights such as "20" or "300",
    # not percentages; the raw parsed weight is returned unchanged.
    overall_weight = record.get("probability_overall")
    if overall_weight:
        weight = parse_rate(overall_weight)
        if weight is not None:
            return weight

    # Time-based spawn weight variants.
    timed_weight = highest(
        "probability_morning",
        "probability_day",
        "probability_evening",
        "probability_night",
    )
    if timed_weight is not None:
        return timed_weight

    # Fallback: gift/trade/static encounters with no rate.
    return 100
# ---------------------------------------------------------------------------
# Level parsing
# ---------------------------------------------------------------------------
def parse_levels(levels_str: str | None) -> tuple[int, int]:
    """Parse a level string into (min_level, max_level).

    "2 - 4"   → (2, 4)
    "2-4"     → (2, 4)
    "67"      → (67, 67)
    "44 - 51" → (44, 51)

    The range separator may be a hyphen, en dash or em dash. The two numbers
    of a range are ordered, so a reversed range such as "4 - 2" still yields
    min_level <= max_level. Only the start of the string is examined; trailing
    text after the first number or range is ignored.

    Returns (1, 1) if the value is empty or does not start with a number.
    """
    if not levels_str:
        return (1, 1)

    text = levels_str.strip()

    # Range: "2 - 4", "2-4", or the same with an en/em dash.
    range_match = re.match(r"(\d+)\s*[-\u2013\u2014]\s*(\d+)", text)
    if range_match:
        first = int(range_match.group(1))
        second = int(range_match.group(2))
        # Guard against reversed ranges in the source data.
        return (min(first, second), max(first, second))

    # Single: "67"
    single_match = re.match(r"(\d+)", text)
    if single_match:
        level = int(single_match.group(1))
        return (level, level)

    return (1, 1)
# ---------------------------------------------------------------------------
# Core processing
# ---------------------------------------------------------------------------
def filter_encounters_for_game(
    encounters: list[dict[str, Any]],
    game_slug: str,
) -> list[dict[str, Any]]:
    """Return the PokeDB encounter records that belong to one game version.

    A record is kept when ``game_slug`` is contained in its
    "version_identifiers" value; a missing or empty value never matches.
    Input order is preserved and the records themselves are not copied.
    """
    selected: list[dict[str, Any]] = []
    for record in encounters:
        versions = record.get("version_identifiers") or []
        if game_slug in versions:
            selected.append(record)
    return selected
def process_encounters(
    raw_encounters: list[dict[str, Any]],
    generation: int,
    pokemon_mapper: PokemonMapper,
    location_mapper: LocationMapper,
) -> dict[str, list[Encounter]]:
    """Turn raw PokeDB encounter records into Encounter objects keyed by area.

    A record is dropped when its encounter method cannot be mapped, its
    pokemon form cannot be resolved, or it has no location area identifier.
    ``location_mapper`` is part of the signature but is not consulted here.

    Returns {location_area_identifier: [Encounter, ...]}, with encounters in
    input order within each area.
    """
    grouped: dict[str, list[Encounter]] = {}

    for raw in raw_encounters:
        # Encounter method: skip records with no method or an unmapped one.
        method_identifier = raw.get("encounter_method_identifier", "")
        if not method_identifier:
            continue
        method = map_encounter_method(method_identifier)
        if method is None:
            continue

        # Pokemon: resolve the form identifier to (id, display name).
        resolved = pokemon_mapper.lookup(raw.get("pokemon_form_identifier"))
        if resolved is None:
            continue
        pokeapi_id, pokemon_name = resolved

        # Levels and rate.
        level_low, level_high = parse_levels(raw.get("levels"))
        rate = extract_encounter_rate(raw, generation)

        # Location area: checked last, after the pokemon lookup has run.
        area_identifier = raw.get("location_area_identifier", "")
        if not area_identifier:
            continue

        entry = Encounter(
            pokeapi_id=pokeapi_id,
            pokemon_name=pokemon_name,
            method=method,
            encounter_rate=rate,
            min_level=level_low,
            max_level=level_high,
        )
        if area_identifier in grouped:
            grouped[area_identifier].append(entry)
        else:
            grouped[area_identifier] = [entry]

    return grouped
def aggregate_encounters(encounters: list[Encounter]) -> list[Encounter]:
    """Merge encounters that share the same (pokeapi_id, method).

    Rates of merged entries are summed and then capped at 100; the level
    range becomes the union of the merged ranges. The input objects are left
    untouched — merging happens on fresh copies.

    Replicates the Go tool's aggregation logic.

    Returns the merged encounters sorted by rate (highest first), then by
    pokemon name; ties keep first-seen order.
    """
    # Dicts keep insertion order, so first-seen order is preserved for free.
    merged: dict[tuple[int, str], Encounter] = {}

    for item in encounters:
        key = (item.pokeapi_id, item.method)
        if key not in merged:
            # Work on a copy so the caller's objects are never mutated.
            merged[key] = Encounter(
                pokeapi_id=item.pokeapi_id,
                pokemon_name=item.pokemon_name,
                method=item.method,
                encounter_rate=item.encounter_rate,
                min_level=item.min_level,
                max_level=item.max_level,
            )
            continue

        target = merged[key]
        target.encounter_rate += item.encounter_rate
        target.min_level = min(target.min_level, item.min_level)
        target.max_level = max(target.max_level, item.max_level)

    for entry in merged.values():
        entry.encounter_rate = min(entry.encounter_rate, 100)

    # Rate descending, then name ascending.
    return sorted(
        merged.values(),
        key=lambda entry: (-entry.encounter_rate, entry.pokemon_name),
    )
def build_routes(
    encounters_by_area: dict[str, list[Encounter]],
    location_mapper: LocationMapper,
) -> list[Route]:
    """Group encounters by location, building parent/child route hierarchy.

    Each area's encounters are aggregated with ``aggregate_encounters``;
    areas whose aggregation is empty are ignored. Per location:

    * two or more areas with encounters → one parent route (no encounters of
      its own) whose children are the per-area routes;
    * exactly one area with encounters → a single flat route;
    * none → nothing is emitted.

    A route is named after its area when the area has a name that differs
    from the location name, otherwise after the location. Every route is
    created with ``order=0``.

    Args:
        encounters_by_area: {location_area_identifier: [Encounter, ...]}.
        location_mapper: Resolves area identifiers to their location
            identifier, area name and location name.

    Returns:
        Routes in first-seen location order.
    """
    # loc_id → [(area_id, area_display_name, encounters), ...]
    loc_groups: dict[str, list[tuple[str, str, list[Encounter]]]] = {}
    for area_id, encounters in encounters_by_area.items():
        # Areas with no known parent location form their own group.
        loc_id = location_mapper.get_location_identifier(area_id) or area_id
        area_name = location_mapper.get_area_name(area_id)
        loc_groups.setdefault(loc_id, []).append((area_id, area_name, encounters))

    routes: list[Route] = []
    for areas in loc_groups.values():
        # The location name is resolved through the group's first area.
        loc_name = location_mapper.get_location_name(areas[0][0])

        children: list[Route] = []
        for _, area_name, encounters in areas:
            aggregated = aggregate_encounters(encounters)
            if not aggregated:
                continue
            # Use the area's own name only when it is distinct from the location.
            if area_name and area_name != loc_name:
                child_name = area_name
            else:
                child_name = loc_name
            children.append(Route(name=child_name, order=0, encounters=aggregated))

        if len(children) > 1:
            # Parent with children
            routes.append(Route(
                name=loc_name,
                order=0,
                encounters=[],
                children=children,
            ))
        elif children:
            # Only one area had encounters — flatten
            routes.append(children[0])

    return routes