Made script actually work

2025-08-19 22:09:27 +02:00 · 2025-08-18 00:36:44 -04:00 · 2025-08-18 00:36:44 -04:00 · 78efc5d130
commit 78efc5d130
parent 43e9d82b26
5 changed files with 321 additions and 81 deletions
--- a/scripts/scrape-trainer-names/check-gender.js
+++ b/scripts/scrape-trainer-names/check-gender.js
@ -1,45 +1,53 @@
 /**
 * Check if the given trainer class is female.
 * @param {Document} document - The HTML document to scrape
- * @returns {[gender: boolean, counterpartURL?: string]} A 2-length tuple containing:
- * 1. The trainer class' normal gender
- * 2. A URL to the gender counterpart of the current class (if the trainer has one).
+ * @returns {[gender: boolean, counterpartURLs: string[]]} A 2-length tuple containing:
+ * 1. The trainer class' gender (female or not)
+ * 2. A list of all the current class' opposite-gender counterparts (if the trainer has any).
 */
 export function checkGenderAndType(document) {
-  const infoBox = document.getElementById("infobox");
+  const infoBox = document.getElementsByClassName("infobox")[0];
  if (!infoBox) {
-    return [false];
+    return [false, []];
  }
  // Find the row of the table containing the specified gender
-  const children = [...infoBox.childNodes];
-  const genderCell = children.find(
-    node => node.nodeName === "tr" && [...node.childNodes].some(c => c.textContent?.includes("Gender")),
-  )?.parentElement;
-  if (!genderCell) {
-    return [false];
+  const children = [...infoBox.getElementsByTagName("tr")];
+  const genderCell = children.find(node => [...node.childNodes].some(c => c.textContent?.includes("Gender")));
+  const tableBox = genderCell?.querySelector("td");
+  if (!tableBox) {
+    return [false, []];
  }

-  const gender = getGender(genderCell.querySelector("tr"));
-  const hrefExtractRegex = /href="\/wiki\/(.*)_(Trainer_class)"/g;
-  const counterpartURL = genderCell.querySelector("td")?.getHTML().match(hrefExtractRegex)?.[1];
+  const gender = getGender(tableBox);

-  return [gender, counterpartURL];
+  // Check the "Counterpart" row's inner HTML for any `href`s to gender counterparts and collect the captured class names
+  const hrefExtractRegex = /href="\/wiki\/(.*?)_\(Trainer_class\)"/g;
+  const counterpartCell = children.find(node => [...node.childNodes].some(c => c.textContent?.includes("Counterpart")));
+
+  const counterpartURLs = [];
+  for (const url of counterpartCell?.innerHTML?.matchAll(hrefExtractRegex) ?? []) {
+    counterpartURLs.push(url[1]);
+  }
+
+  return [gender, counterpartURLs];
 }

 /**
 * Retrieve the gender from the given node text.
- * @param {HTMLTableRowElement?} genderCell - The cell to check
+ * @param {HTMLTableCellElement} genderCell - The cell to check
 * @returns {boolean} The gender type
 * @todo Handle trainers whose gender type has changed across different gens (Artists, etc.)
 */
 function getGender(genderCell) {
-  switch (genderCell?.textContent) {
-    case "Female Only":
-      return false;
-    case "Male Only":
-    case "Both":
+  const gender = genderCell.textContent?.trim().toLowerCase() ?? ""; // always a string here, so the `undefined` case below can never match
+
+  switch (gender) {
+    case "female only":
+      return true;
+    case "male only":
+    case "both":
    case undefined:
    default:
-      return true;
+      return false; // every value other than "female only" is reported as not female
  }
 }
--- a/scripts/scrape-trainer-names/fetch-names.js
+++ b/scripts/scrape-trainer-names/fetch-names.js
@ -1,42 +1,28 @@
-import chalk from "chalk";
-import { JSDOM } from "jsdom";
-import { checkGenderAndType } from "./check-gender.js";
-
 /**
- * @import { nameRecord, parsedNames } from "./types.js";
+ * @import { parsedNames } from "./types.js";
 */

 /**
- * Fetch a given trainer's names from the given URL.
- * @param {string} url - The URL to parse
- * @param {boolean} [currGender] - The current class' known gender.
- * If provided, will override the natural gender detection with the given gender and avoid
- * checking any gender counterparts.
- * @returns {Promise<parsedNames>} A Promise that resolves with the parsed names once the parsing concludes.
- * Will resolve with an empty array if the name could not be parsed.
+ * An error code for a bad URL.
 */
-export async function fetchNames(url, currGender) {
-  const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${url}_(Trainer_class)`)).window;
-  const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement;
+export const INVALID_URL = "bad_url_code";
+
+/** @type {const} */
+
+/**
+ * Fetch a given trainer's names from the given HTML document.
+ * @param {HTMLElement | null | undefined} trainerListHeader - The header containing the trainer lists
+ * @param {boolean} [knownFemale=false] - Whether the class is known to be female; default `false`
+ * @returns {parsedNames | typeof INVALID_URL}
+ * An object containing the parsed names. \
+ * Will instead return with {@linkcode INVALID_URL} if the data is invalid.
+ */
+export function fetchNames(trainerListHeader, knownFemale = false) {
+  const trainerNames = /** @type {Set<string>} */ (new Set());
+  const femaleTrainerNames = /** @type {Set<string>} */ (new Set());
  if (!trainerListHeader?.parentElement?.childNodes) {
-    console.warn(chalk.hex("#ffa500")(`URL ${url} did not correspond to a valid trainer class!`));
-    return { male: [], female: [] };
-  }
-
-  let trainerNames = /** @type {Set<string>} */ (new Set());
-  let femaleTrainerNames = /** @type {Set<string>} */ (new Set());
-
-  // If we don't know whether this class is female, check, optionally recursing into the counterpart's webpage as well.
-  if (currGender === undefined) {
-    /** @type {string | undefined} */
-    let counterpartURL;
-    [currGender, counterpartURL] = checkGenderAndType(document);
-    if (counterpartURL) {
-      console.log(chalk.green(`Accessing gender counterpart URL: ${counterpartURL}`));
-      const names = await fetchNames(counterpartURL, !currGender);
-      trainerNames = new Set(names.male);
-      femaleTrainerNames = new Set(names.female);
-    }
+    // Return early if no child nodes (ie tables) can be found
+    return INVALID_URL;
  }

  const elements = [...trainerListHeader.parentElement.childNodes];
@ -55,7 +41,7 @@ export async function fetchNames(url, currGender) {
    ),
  );

-  parseTable(tables, currGender, trainerNames, femaleTrainerNames);
+  parseTable(tables, knownFemale, trainerNames, femaleTrainerNames);
  return {
    male: Array.from(trainerNames),
    female: Array.from(femaleTrainerNames),
--- a/scripts/scrape-trainer-names/help-message.js
+++ b/scripts/scrape-trainer-names/help-message.js
@ -0,0 +1,16 @@
+import chalk from "chalk";
+
+/**
+ * Show help/usage text for the `scrape-trainers` CLI.
+ *
+ * Prints a single colorized template string to stdout via `console.log`; takes no
+ * arguments and returns nothing.
+ *
+ * NOTE(review): keep the options listed here in sync with the argument parsing in
+ * `main.js` - verify that every flag advertised below is actually handled there.
+ */
+export function showHelpText() {
+  console.log(`
+Usage: ${chalk.cyan("pnpm scrape-trainers [options] <names>")}
+Note that all option names are ${chalk.bold("case insensitive")}.
+
+${chalk.hex("#8a2be2")("Arguments:")}
+  ${chalk.hex("#7fff00")("names")}                    The name of one or more trainer classes to parse.
+
+${chalk.hex("#ffa500")("Options:")}
+  ${chalk.blue("-h, --help")}               Show this help message.
+  ${chalk.blue("-o, --out, --outfile")}     The path to a file to save the output. If not provided, will send directly to stdout.
+`);
+}
--- a/scripts/scrape-trainer-names/main.js
+++ b/scripts/scrape-trainer-names/main.js
@ -1,36 +1,264 @@
-import { toCamelCase, toPascalSnakeCase } from "../helpers/strings.js";
-import { fetchNames } from "./fetch-names.js";
+import { existsSync, writeFileSync } from "node:fs";
+import { format } from "node:util";
+import chalk from "chalk";
+import inquirer from "inquirer";
+import { JSDOM } from "jsdom";
+import { toCamelCase, toPascalSnakeCase, toTitleCase } from "../helpers/strings.js";
+import { checkGenderAndType } from "./check-gender.js";
+import { fetchNames, INVALID_URL } from "./fetch-names.js";
+import { showHelpText } from "./help-message.js";

 /**
 * @packageDocumentation
 * This script will scrape Bulbapedia for the English names of a given trainer class,
 * outputting them as JSON.
- * Usage:
+ * Usage: `pnpm scrape-trainers`
 */

 /**
- * Scrape the requested trainer names and format the resultant output.
- * @param {...string} classes The names of the trainer classes to retrieve
- * @returns {Promise<string>} A Promise that resolves with the finished text.
+ * @import { parsedNames } from "./types.js"
 */
-async function scrapeTrainerNames(...classes) {
-  /**
-   * A large object mapping each class to their corresponding list of trainer names. \
-   * Trainer classes with only 1 gender will only contain the single array for that gender.
-   * @type {Record<string, string[] | parsedNames>}
-   */
-  const nameTuples = Object.fromEntries(
-    await Promise.all(
-      classes.map(async trainerClass => {
-        // Bulba URLs use Pascal_Snake_Case (Bug_Catcher)
-        const classURL = toPascalSnakeCase(trainerClass);
-        const names = await fetchNames(classURL);
-        const namesObj = names.female.length === 0 ? names.male : names;
-        return [toCamelCase(trainerClass), namesObj];
-      }),
-    ),
-  );
-  return JSON.stringify(nameTuples, null, 2);
+
+const version = "1.0.0";
+// Flags that introduce the outfile path. `--out` is listed because the CLI help text advertises it;
+// `--outFile` is kept so existing invocations continue to work.
+const SUPPORTED_ARGS = /** @type {const} */ (["-o", "--out", "--outfile", "--outFile"]);
+
+/**
+ * A large object mapping each "base" trainer name to a list of replacements.
+ * Used to allow for trainer classes with different `TrainerType`s than in mainline.
+ * @type {Record<string, string[]>}
+ */
+const trainerNamesMap = {
+  pokemonBreeder: ["breeder"],
+  worker: ["worker", "snowWorker"],
+  richBoy: ["richKid"],
+  gentleman: ["rich"],
+};
+
+/**
+ * CLI entry point: print the banner, parse the command line, scrape the requested
+ * trainer classes and write (or print) the resulting JSON.
+ *
+ * Sets `process.exitCode` to 1 when no trainer classes are given or when scraping/writing fails.
+ * @returns {Promise<void>}
+ */
+async function main() {
+  console.log(chalk.hex("#FF7F50")(`🍳 Trainer Name Scraper v${version}`));
+
+  const args = process.argv.slice(2);
+
+  // The help text advertises `-h`/`--help` (case insensitive), so honor it before parsing anything else.
+  if (args.some(arg => ["-h", "--help"].includes(arg.toLowerCase()))) {
+    showHelpText();
+    return;
+  }
+
+  const out = getOutfile(args);
+  // Break out if no args remain
+  if (args.length === 0) {
+    console.error(
+      chalk.red.bold(
+        `✗ Error: No trainer classes provided!\nArgs: ${chalk.hex("#7310fdff")(process.argv.slice(2).join(", "))}`,
+      ),
+    );
+    showHelpText();
+    process.exitCode = 1;
+    return;
+  }
+
+  try {
+    const output = await scrapeTrainerNames(args);
+    await tryWriteFile(out, output);
+  } catch (err) {
+    // Scraping rejects when a page is not a valid trainer class; report the failure and exit
+    // non-zero instead of dying with an unhandled promise rejection.
+    console.error(err instanceof Error ? chalk.red.bold(`✗ Error: ${err.message}`) : err);
+    process.exitCode = 1;
+  }
 }

-console.log(await scrapeTrainerNames("doctor"));
+/**
+ * Get the outfile location from the args array.
+ *
+ * The outfile option is only recognized as the first argument, in either the
+ * `flag=path` or `flag path` form. Flag names are compared case-insensitively
+ * against `SUPPORTED_ARGS`.
+ * @param {string[]} args - The command line arguments
+ * @returns {string | undefined} The outfile location, or `undefined` if none is provided
+ * @remarks
+ * This will mutate the `args` array by removing the outfile option (and its value) from the list of arguments.
+ */
+function getOutfile(args) {
+  const firstArg = args[0] ?? "";
+  // Split a potential "flag=value" pair; `eqIndex` stays -1 for the "flag value" form.
+  const eqIndex = firstArg.indexOf("=");
+  const flag = (eqIndex === -1 ? firstArg : firstArg.slice(0, eqIndex)).toLowerCase();
+
+  // Only treat the argument as an outfile option if its name is one we actually support;
+  // anything else is left in `args` untouched.
+  if (!SUPPORTED_ARGS.some(supported => supported.toLowerCase() === flag)) {
+    console.log(chalk.hex("#ffa500")("No outfile detected, logging to stdout..."));
+    return;
+  }
+
+  let /** @type {string | undefined} */ outFile;
+  if (eqIndex !== -1) {
+    outFile = firstArg.slice(eqIndex + 1);
+    args.splice(0, 1);
+  } else {
+    outFile = args[1];
+    args.splice(0, 2);
+  }
+
+  if (!outFile) {
+    // Flag given without a path (e.g. a trailing `-o`): fall back to stdout rather than using "undefined".
+    console.log(chalk.hex("#ffa500")("Outfile option given without a path, logging to stdout..."));
+    return;
+  }
+
+  console.log(chalk.hex("#ffa500")(`Using outfile: ${chalk.blue(outFile)}`));
+  return outFile;
+}
+
+/**
+ * Scrape the requested trainer names and format the resultant output.
+ *
+ * Duplicate class names are fetched only once. Keys present in `trainerNamesMap` are
+ * replaced by their mapped key(s), the entries are sorted by key, and every array in the
+ * result is serialized as an object with 1-based numeric keys.
+ * @param {string[]} classes The names of the trainer classes to retrieve
+ * @returns {Promise<string>} A Promise that resolves with the finished text.
+ */
+async function scrapeTrainerNames(classes) {
+  const uniqueClasses = [...new Set(classes)];
+
+  /**
+   * A Set containing all trainer URLs that have been seen.
+   * @type {Set<string>}
+   */
+  const seenClasses = new Set();
+
+  /**
+   * A large array of tuples matching each class to their corresponding list of trainer names. \
+   * Trainer classes with only 1 gender will only contain the single array for that gender.
+   * @type {[keyName: string, names: string[] | parsedNames][]}
+   */
+  const fetchedTuples = await Promise.all(
+    uniqueClasses.map(async trainerClass => {
+      const [trainerName, names] = await doFetch(trainerClass, seenClasses);
+      const namesObj = names.female.length === 0 ? names.male : names;
+      return /** @type {[keyName: string, names: string[] | parsedNames]} */ ([trainerName, namesObj]);
+    }),
+  );
+
+  // Swap every key found in the name replacement map for its replacement(s), in place.
+  // `Object.hasOwn` is used (rather than `in`) so inherited keys such as "constructor" never match.
+  const namesTuples = fetchedTuples.flatMap(tuple => {
+    const [keyName, names] = tuple;
+    if (!Object.hasOwn(trainerNamesMap, keyName)) {
+      return [tuple];
+    }
+    return trainerNamesMap[keyName].map(
+      name => /** @type {[keyName: string, names: parsedNames | string[]]} */ ([name, names]),
+    );
+  });
+
+  namesTuples.sort((a, b) => a[0].localeCompare(b[0]));
+
+  /** @type {Record<string, string[] | parsedNames>} */
+  const namesRecord = Object.fromEntries(namesTuples);
+
+  // Convert all arrays into objects indexed by the (1-based) position
+  return JSON.stringify(
+    namesRecord,
+    (_, v) => (Array.isArray(v) ? Object.fromEntries(v.map((curr, i) => [i + 1, curr])) : v),
+    2,
+  );
+}
+
+/**
+ * Recursively scrape names from a given Trainer class and its gender counterparts.
+ * @param {string} trainerClass - The name of the trainer class; converted to Pascal_Snake_Case to build the page URL
+ * @param {Set<string>} seenClasses - A Set containing all seen class URLs, used for record keeping.
+ * The current class' URL is added to it before the page is fetched.
+ * @returns {Promise<[string, parsedNames]>}
+ * A Promise that resolves with:
+ * 1. The name to use for the key.
+ * 2. All fetched names for this trainer class and its gender variants.
+ * @throws {Error} If the fetched page does not contain a valid trainer list.
+ */
+async function doFetch(trainerClass, seenClasses) {
+  let keyName = toCamelCase(trainerClass);
+  const classURL = toPascalSnakeCase(trainerClass);
+  seenClasses.add(classURL);
+
+  const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${classURL}_(Trainer_class)`))
+    .window;
+  const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement;
+  const [female, counterpartURLs] = checkGenderAndType(document);
+  const names = fetchNames(trainerListHeader, female);
+  if (names === INVALID_URL) {
+    // Throw a real Error (not a bare string) so callers get a stack trace and a `.message`
+    throw new Error(`URL ${classURL} did not correspond to a valid trainer class!`);
+  }
+
+  // Recurse into all unseen gender counterparts' URLs, using the first male name we find
+  const counterpartNames = await Promise.all(
+    counterpartURLs
+      .filter(url => !seenClasses.has(url))
+      .map(counterpartURL => {
+        console.log(chalk.green(`Accessing gender counterpart URL: ${toTitleCase(counterpartURL)}`));
+        return doFetch(counterpartURL, seenClasses);
+      }),
+  );
+  let overrodeName = false;
+  for (const [cKeyName, cNameObj] of counterpartNames) {
+    // When this class is female, the first counterpart's key replaces ours
+    if (!overrodeName && female) {
+      overrodeName = true;
+      console.log(chalk.green(`Using "${cKeyName}" as the name of the JSON key object...`));
+      keyName = cKeyName;
+    }
+    // Merge the counterpart's names into ours, dropping duplicates
+    names.male = [...new Set(names.male.concat(cNameObj.male))];
+    names.female = [...new Set(names.female.concat(cNameObj.female))];
+  }
+  return [normalizeDiacritics(keyName), names];
+}
+
+/**
+ * Strip diacritical marks from a string, leaving only the base characters.
+ * @param {string} str - The string to parse
+ * @returns {string} The string with normalized diacritics
+ */
+function normalizeDiacritics(str) {
+  // NFKD decomposition separates each accented letter into its base letter followed by
+  // combining marks (à -> a + U+0300); the marks removed here are those in U+0300-U+036F.
+  const combiningMarks = /[\u0300-\u036f]/g;
+  const decomposed = str.normalize("NFKD");
+  return decomposed.replace(combiningMarks, "");
+}
+
+/**
+ * Write the scraped output to the given file, or print it to stdout when no file is given.
+ *
+ * If the file already exists the user is asked to confirm the overwrite; declining, or any
+ * write failure, sets `process.exitCode` to 1.
+ * @param {string | undefined} outFile - The outfile
+ * @param {string} output - The scraped output to produce
+ */
+async function tryWriteFile(outFile, output) {
+  if (!outFile) {
+    console.log(output);
+    return;
+  }
+
+  const alreadyExists = existsSync(outFile);
+  if (alreadyExists && !(await promptExisting(outFile))) {
+    process.exitCode = 1;
+    return;
+  }
+
+  try {
+    writeFileSync(outFile, output);
+    console.log(chalk.green.bold(`✔ Output written to ${chalk.blue(outFile)} successfully!`));
+  } catch (e) {
+    let /** @type {string} */ errStr;
+    if (e instanceof Error) {
+      // Node.JS file errors carry a `code` describing the failure
+      const { code } = /** @type {NodeJS.ErrnoException} */ (e);
+      if (code === "ENOENT") {
+        errStr = `File not found: ${outFile}`;
+      } else if (code === "EACCES") {
+        errStr = `Could not write ${outFile}: Permission denied`;
+      } else if (code === "EISDIR") {
+        errStr = `Unable to write to ${outFile} as it is a directory`;
+      } else {
+        errStr = `Error writing file: ${e.message}`;
+      }
+    } else {
+      errStr = format("Unknown error occurred: ", e);
+    }
+    console.error(chalk.red.bold(errStr));
+    process.exitCode = 1;
+  }
+}
+
+/**
+ * Ask the user whether an already-existing outfile should be replaced.
+ * @param {string} outFile - The outfile
+ * @returns {Promise<boolean>} Whether "Yes" or "No" was selected.
+ */
+async function promptExisting(outFile) {
+  // The answer is stored under the question's `name`, i.e. "continue"; the default answer is "No"
+  const { continue: shouldReplace } = await inquirer.prompt([
+    {
+      type: "confirm",
+      name: "continue",
+      message: `File ${chalk.blue(outFile)} already exists!` + "\nDo you want to replace it?",
+      default: false,
+    },
+  ]);
+  return shouldReplace;
+}
+
+main();
--- a/scripts/scrape-trainer-names/types.js
+++ b/scripts/scrape-trainer-names/types.js
@ -5,3 +5,5 @@
 * @property {string[]} male
 * @property {string[]} female
 */
+
+export {};