diff --git a/scripts/scrape-trainer-names/check-gender.js b/scripts/scrape-trainer-names/check-gender.js index 641305708bf..0a8d2aecdaa 100644 --- a/scripts/scrape-trainer-names/check-gender.js +++ b/scripts/scrape-trainer-names/check-gender.js @@ -1,45 +1,53 @@ /** * Check if the given trainer class is female. * @param {Document} document - The HTML document to scrape - * @returns {[gender: boolean, counterpartURL?: string]} A 2-length tuple containing: - * 1. The trainer class' normal gender - * 2. A URL to the gender counterpart of the current class (if the trainer has one). + * @returns {[gender: boolean, counterpartURLs: string[]]} A 2-length tuple containing: + * 1. The trainer class' gender (female or not) + * 2. A list of all the current class' opposite-gender counterparts (if the trainer has any). */ export function checkGenderAndType(document) { - const infoBox = document.getElementById("infobox"); + const infoBox = document.getElementsByClassName("infobox")[0]; if (!infoBox) { - return [false]; + return [false, []]; } // Find the row of the table containing the specified gender - const children = [...infoBox.childNodes]; - const genderCell = children.find( - node => node.nodeName === "tr" && [...node.childNodes].some(c => c.textContent?.includes("Gender")), - )?.parentElement; - if (!genderCell) { - return [false]; + const children = [...infoBox.getElementsByTagName("tr")]; + const genderCell = children.find(node => [...node.childNodes].some(c => c.textContent?.includes("Gender"))); + const tableBox = genderCell?.querySelector("td"); + if (!tableBox) { + return [false, []]; } - const gender = getGender(genderCell.querySelector("tr")); - const hrefExtractRegex = /href="\/wiki\/(.*)_(Trainer_class)"/g; - const counterpartURL = genderCell.querySelector("td")?.getHTML().match(hrefExtractRegex)?.[1]; + const gender = getGender(tableBox); - return [gender, counterpartURL]; + // CHeck the cell's inner HTML for any `href`s to gender counterparts and scrape them too + const hrefExtractRegex = /href="\/wiki\/(.*?)_\(Trainer_class\)"/g; + const counterpartCell = children.find(node => [...node.childNodes].some(c => c.textContent?.includes("Counterpart"))); + + const counterpartURLs = []; + for (const url of counterpartCell?.innerHTML?.matchAll(hrefExtractRegex) ?? []) { + counterpartURLs.push(url[1]); + } + + return [gender, counterpartURLs]; } /** * Retrieve the gender from the given node text. - * @param {HTMLTableRowElement?} genderCell - The cell to check + * @param {HTMLTableCellElement} genderCell - The cell to check * @returns {boolean} The gender type * @todo Handle trainers whose gender type has changed across different gens (Artists, etc.) */ function getGender(genderCell) { - switch (genderCell?.textContent) { - case "Female Only": - return false; - case "Male Only": - case "Both": + const gender = genderCell.textContent?.trim().toLowerCase() ?? ""; + + switch (gender) { + case "female only": + return true; + case "male only": + case "both": case undefined: default: - return true; + return false; } } diff --git a/scripts/scrape-trainer-names/fetch-names.js b/scripts/scrape-trainer-names/fetch-names.js index 03e7c5b66f6..e2c9bd3093a 100644 --- a/scripts/scrape-trainer-names/fetch-names.js +++ b/scripts/scrape-trainer-names/fetch-names.js @@ -1,42 +1,28 @@ -import chalk from "chalk"; -import { JSDOM } from "jsdom"; -import { checkGenderAndType } from "./check-gender.js"; - /** - * @import { nameRecord, parsedNames } from "./types.js"; + * @import { parsedNames } from "./types.js"; */ /** - * Fetch a given trainer's names from the given URL. - * @param {string} url - The URL to parse - * @param {boolean} [currGender] - The current class' known gender. - * If provided, will override the natural gender detection with the given gender and avoid - * checking any gender counterparts. - * @returns {Promise} A Promise that resolves with the parsed names once the parsing concludes. - * Will resolve with an empty array if the name could not be parsed. + * An error code for a bad URL. */ -export async function fetchNames(url, currGender) { - const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${url}_(Trainer_class)`)).window; - const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement; +export const INVALID_URL = "bad_url_code"; + +/** @type {const} */ + +/** + * Fetch a given trainer's names from the given HTML document. + * @param {HTMLElement | null | undefined} trainerListHeader - The header containing the trainer lists + * @param {boolean} [knownFemale=false] - Whether the class is known to be female; default `false` + * @returns {parsedNames | INVALID_URL} + * An object containing the parsed names. \ + * Will instead return with {@linkcode INVALID_URL} if the data is invalid. + */ +export function fetchNames(trainerListHeader, knownFemale = false) { + const trainerNames = /** @type {Set} */ (new Set()); + const femaleTrainerNames = /** @type {Set} */ (new Set()); if (!trainerListHeader?.parentElement?.childNodes) { - console.warn(chalk.hex("#ffa500")(`URL ${url} did not correspond to a valid trainer class!`)); - return { male: [], female: [] }; - } - - let trainerNames = /** @type {Set} */ (new Set()); - let femaleTrainerNames = /** @type {Set} */ (new Set()); - - // If we don't know whether this class is female, check, optionally recursing into the counterpart's webpage as well. - if (currGender === undefined) { - /** @type {string | undefined} */ - let counterpartURL; - [currGender, counterpartURL] = checkGenderAndType(document); - if (counterpartURL) { - console.log(chalk.green(`Accessing gender counterpart URL: ${counterpartURL}`)); - const names = await fetchNames(counterpartURL, !currGender); - trainerNames = new Set(names.male); - femaleTrainerNames = new Set(names.female); - } + // Return early if no child nodes (ie tables) can be found + return INVALID_URL; } const elements = [...trainerListHeader.parentElement.childNodes]; @@ -55,7 +41,7 @@ export async function fetchNames(url, currGender) { ), ); - parseTable(tables, currGender, trainerNames, femaleTrainerNames); + parseTable(tables, knownFemale, trainerNames, femaleTrainerNames); return { male: Array.from(trainerNames), female: Array.from(femaleTrainerNames), diff --git a/scripts/scrape-trainer-names/help-message.js b/scripts/scrape-trainer-names/help-message.js new file mode 100644 index 00000000000..fe922a5e988 --- /dev/null +++ b/scripts/scrape-trainer-names/help-message.js @@ -0,0 +1,16 @@ +import chalk from "chalk"; + +/** Show help/usage text for the `scrape-trainers` CLI. */ +export function showHelpText() { + console.log(` +Usage: ${chalk.cyan("pnpm scrape-trainers [options] ")} +Note that all option names are ${chalk.bold("case insensitive")}. + +${chalk.hex("#8a2be2")("Arguments:")} + ${chalk.hex("#7fff00")("names")} The name of one or more trainer classes to parse. + +${chalk.hex("#ffa500")("Options:")} + ${chalk.blue("-h, --help")} Show this help message. + ${chalk.blue("-o, --out, --outfile")} The path to a file to save the output. If not provided, will send directly to stdout. +`); +} diff --git a/scripts/scrape-trainer-names/main.js b/scripts/scrape-trainer-names/main.js index 98f7860442f..5d86f30d18c 100644 --- a/scripts/scrape-trainer-names/main.js +++ b/scripts/scrape-trainer-names/main.js @@ -1,36 +1,264 @@ -import { toCamelCase, toPascalSnakeCase } from "../helpers/strings.js"; -import { fetchNames } from "./fetch-names.js"; +import { existsSync, writeFileSync } from "node:fs"; +import { format } from "node:util"; +import chalk from "chalk"; +import inquirer from "inquirer"; +import { JSDOM } from "jsdom"; +import { toCamelCase, toPascalSnakeCase, toTitleCase } from "../helpers/strings.js"; +import { checkGenderAndType } from "./check-gender.js"; +import { fetchNames, INVALID_URL } from "./fetch-names.js"; +import { showHelpText } from "./help-message.js"; /** * @packageDocumentation * This script will scrape Bulbapedia for the English names of a given trainer class, * outputting them as JSON. - * Usage: + * Usage: `pnpm scrape-trainers` */ /** - * Scrape the requested trainer names and format the resultant output. - * @param {...string} classes The names of the trainer classes to retrieve - * @returns {Promise} A Promise that resolves with the finished text. + * @import { parsedNames } from "./types.js" */ -async function scrapeTrainerNames(...classes) { - /** - * A large object mapping each class to their corresponding list of trainer names. \ - * Trainer classes with only 1 gender will only contain the single array for that gender. - * @type {Record} - */ - const nameTuples = Object.fromEntries( - await Promise.all( - classes.map(async trainerClass => { - // Bulba URLs use Pascal_Snake_Case (Bug_Catcher) - const classURL = toPascalSnakeCase(trainerClass); - const names = await fetchNames(classURL); - const namesObj = names.female.length === 0 ? names.male : names; - return [toCamelCase(trainerClass), namesObj]; - }), - ), - ); - return JSON.stringify(nameTuples, null, 2); + +const version = "1.0.0"; +const SUPPORTED_ARGS = /** @type {const} */ (["-o", "--outfile", "--outFile"]); + +/** + * A large object mapping each "base" trainer name to a list of replacements. + * Used to allow for trainer classes with different `TrainerType`s than in mainline. + * @type {Record} + */ +const trainerNamesMap = { + pokemonBreeder: ["breeder"], + worker: ["worker", "snowWorker"], + richBoy: ["richKid"], + gentleman: ["rich"], +}; + +async function main() { + console.log(chalk.hex("#FF7F50")(`🍳 Trainer Name Scraper v${version}`)); + + const args = process.argv.slice(2); + const out = getOutfile(args); + // Break out if no args remain + if (args.length === 0) { + console.error( + chalk.red.bold( + `✗ Error: No trainer classes provided!\nArgs: ${chalk.hex("#7310fdff")(process.argv.slice(2).join(", "))}`, + ), + ); + showHelpText(); + process.exitCode = 1; + return; + } + + const output = await scrapeTrainerNames(args); + await tryWriteFile(out, output); } -console.log(await scrapeTrainerNames("doctor")); +/** + * Get the outfile location from the args array. + * @param {string[]} args - The command line arguments + * @returns {string | undefined} The outfile location, or `undefined` if none is provided + * @remarks + * This will mutate the `args` array by removing the outfile from the list of arguments. + */ +function getOutfile(args) { + let /** @type {string} */ outFile; + // Extract the argument as either the form "x=y" or "x y". + const hasEquals = args[0]?.match(/^(.*)=(.*)$/g); + if (hasEquals) { + outFile = hasEquals[2]; + args.splice(0, 1); + } else if (/** @type {readonly string[]} */ (SUPPORTED_ARGS).includes(args[0])) { + outFile = args[1]; + args.splice(0, 2); + } else { + console.log(chalk.hex("#ffa500")("No outfile detected, logging to stdout...")); + return; + } + + console.log(chalk.hex("#ffa500")(`Using outfile: ${chalk.blue(outFile)}`)); + return outFile; +} + +/** + * Scrape the requested trainer names and format the resultant output. + * @param {string[]} classes The names of the trainer classes to retrieve + * @returns {Promise} A Promise that resolves with the finished text. + */ +async function scrapeTrainerNames(classes) { + classes = [...new Set(classes)]; + + /** + * A Set containing all trainer URLs that have been seen. + * @type {Set} + */ + const seenClasses = new Set(); + + /** + * A large array of tuples matching each class to their corresponding list of trainer names. \ + * Trainer classes with only 1 gender will only contain the single array for that gender. + * @type {[keyName: string, names: string[] | parsedNames][]} + */ + const namesTuples = await Promise.all( + classes.map(async trainerClass => { + const [trainerName, names] = await doFetch(trainerClass, seenClasses); + const namesObj = names.female.length === 0 ? names.male : names; + return /** @type {const} */ ([trainerName, namesObj]); + }), + ); + + // Grab all keys inside the name replacement map and change them accordingly. + const mappedNames = namesTuples.filter(tuple => tuple[0] in trainerNamesMap); + for (const nameTuple of mappedNames) { + const namesMapping = trainerNamesMap[nameTuple[0]]; + namesTuples.splice( + namesTuples.indexOf(nameTuple), + 1, + ...namesMapping.map( + name => /** @type {[keyName: string, names: parsedNames | string[]]} */ ([name, nameTuple[1]]), + ), + ); + } + + namesTuples.sort((a, b) => a[0].localeCompare(b[0])); + + /** @type {Record} */ + const namesRecord = Object.fromEntries(namesTuples); + + // Convert all arrays into objects indexed by the number + return JSON.stringify( + namesRecord, + (_, v) => { + if (Array.isArray(v)) { + return v.reduce((ret, curr, i) => { + ret[i + 1] = curr; // 1 indexed + return ret; + }, {}); + } + return v; + }, + 2, + ); +} + +/** + * Recursively scrape names from a given Trainer class and its gender counterparts. + * @param {string} trainerClass - The URL to parse + * @param {Set} seenClasses - A Set containing all seen class URLs, used for record keeping. + * @returns {Promise<[string, parsedNames]>} + * A Promise that resolves with: + * 1. The name to use for the key. + * 2. All fetched names for this trainer class and its gender variants. + */ +async function doFetch(trainerClass, seenClasses) { + let keyName = toCamelCase(trainerClass); + const classURL = toPascalSnakeCase(trainerClass); + seenClasses.add(classURL); + + const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${classURL}_(Trainer_class)`)) + .window; + const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement; + const [female, counterpartURLs] = checkGenderAndType(document); + const names = fetchNames(trainerListHeader, female); + if (names === INVALID_URL) { + return Promise.reject(chalk.red.bold(`URL ${classURL} did not correspond to a valid trainer class!`)); + } + + // Recurse into all unseen gender counterparts' URLs, using the first male name we find + const counterpartNames = await Promise.all( + counterpartURLs + .filter(url => !seenClasses.has(url)) + .map(counterpartURL => { + console.log(chalk.green(`Accessing gender counterpart URL: ${toTitleCase(counterpartURL)}`)); + return doFetch(counterpartURL, seenClasses); + }), + ); + let overrodeName = false; + for (const [cKeyName, cNameObj] of counterpartNames) { + if (!overrodeName && female) { + overrodeName = true; + console.log(chalk.green(`Using "${cKeyName}" as the name of the JSON key object...`)); + keyName = cKeyName; + } + names.male = [...new Set(names.male.concat(cNameObj.male))]; + names.female = [...new Set(names.female.concat(cNameObj.female))]; + } + return [normalizeDiacritics(keyName), names]; +} + +/** + * Convert all diacritical marks within a string into their normalized variants. + * @param {string} str - The string to parse + * @returns {string} The string with normalized diacritics + */ +function normalizeDiacritics(str) { + // Normalizing to NFKD splits all diacritics into the base letter + grapheme (à -> a + `), + // which are conveniently all in their own little Unicode block for easy removal + return str.normalize("NFKD").replace(/[\u0300-\u036f]/g, ""); +} + +/** + * Try to write the output to a file (or log it to stdout, as the case may be). + * @param {string | undefined} outFile - The outfile + * @param {string} output - The scraped output to produce + */ +async function tryWriteFile(outFile, output) { + if (!outFile) { + console.log(output); + return; + } + + if (existsSync(outFile) && !(await promptExisting(outFile))) { + process.exitCode = 1; + return; + } + + try { + writeFileSync(outFile, output); + console.log(chalk.green.bold(`✔ Output written to ${chalk.blue(outFile)} successfully!`)); + } catch (e) { + let /** @type {string} */ errStr; + if (!(e instanceof Error)) { + errStr = format("Unknown error occurred: ", e); + } else { + // @ts-expect-error - Node.JS file errors always have codes + switch (e.code) { + case "ENOENT": + errStr = `File not found: ${outFile}`; + break; + case "EACCES": + errStr = `Could not write ${outFile}: Permission denied`; + break; + case "EISDIR": + errStr = `Unable to write to ${outFile} as it is a directory`; + break; + default: + errStr = `Error writing file: ${e.message}`; + } + } + console.error(chalk.red.bold(errStr)); + process.exitCode = 1; + return; + } +} + +/** + * Confirm overwriting an already-existing file. + * @param {string} outFile - The outfile + * @returns {Promise} Whether "Yes" or "No" was selected. + */ +async function promptExisting(outFile) { + return ( + await inquirer.prompt([ + { + type: "confirm", + name: "continue", + message: `File ${chalk.blue(outFile)} already exists!` + "\nDo you want to replace it?", + default: false, + }, + ]) + ).continue; +} + +main(); diff --git a/scripts/scrape-trainer-names/types.js b/scripts/scrape-trainer-names/types.js index f2de7bc487e..0ea07db5164 100644 --- a/scripts/scrape-trainer-names/types.js +++ b/scripts/scrape-trainer-names/types.js @@ -5,3 +5,5 @@ * @property {string[]} male * @property {string[]} female */ + +export {};