Made script actually work

This commit is contained in:
Bertie690 2025-08-18 00:36:44 -04:00
parent 43e9d82b26
commit 78efc5d130
5 changed files with 321 additions and 81 deletions

View File

@ -1,45 +1,53 @@
/**
* Check if the given trainer class is female.
* @param {Document} document - The HTML document to scrape
* @returns {[gender: boolean, counterpartURL?: string]} A 2-length tuple containing:
* 1. The trainer class' normal gender
* 2. A URL to the gender counterpart of the current class (if the trainer has one).
* @returns {[gender: boolean, counterpartURLs: string[]]} A 2-length tuple containing:
* 1. The trainer class' gender (female or not)
* 2. A list of all the current class' opposite-gender counterparts (if the trainer has any).
*/
export function checkGenderAndType(document) {
const infoBox = document.getElementById("infobox");
const infoBox = document.getElementsByClassName("infobox")[0];
if (!infoBox) {
return [false];
return [false, []];
}
// Find the row of the table containing the specified gender
const children = [...infoBox.childNodes];
const genderCell = children.find(
node => node.nodeName === "tr" && [...node.childNodes].some(c => c.textContent?.includes("Gender")),
)?.parentElement;
if (!genderCell) {
return [false];
const children = [...infoBox.getElementsByTagName("tr")];
const genderCell = children.find(node => [...node.childNodes].some(c => c.textContent?.includes("Gender")));
const tableBox = genderCell?.querySelector("td");
if (!tableBox) {
return [false, []];
}
const gender = getGender(genderCell.querySelector("tr"));
const hrefExtractRegex = /href="\/wiki\/(.*)_(Trainer_class)"/g;
const counterpartURL = genderCell.querySelector("td")?.getHTML().match(hrefExtractRegex)?.[1];
const gender = getGender(tableBox);
return [gender, counterpartURL];
// Check the cell's inner HTML for any `href`s to gender counterparts and scrape them too
// The lazy capture group keeps only the class name between `/wiki/` and `_(Trainer_class)`
const hrefExtractRegex = /href="\/wiki\/(.*?)_\(Trainer_class\)"/g;
const counterpartCell = children.find(node => [...node.childNodes].some(c => c.textContent?.includes("Counterpart")));
const counterpartURLs = [];
// Fall back to an empty list when no row mentioning "Counterpart" was found
for (const url of counterpartCell?.innerHTML?.matchAll(hrefExtractRegex) ?? []) {
// Index 1 is the captured class name (index 0 would be the whole `href="..."` match)
counterpartURLs.push(url[1]);
}
return [gender, counterpartURLs];
}
/**
* Retrieve the gender from the given node text.
* @param {HTMLTableRowElement?} genderCell - The cell to check
* @param {HTMLTableCellElement} genderCell - The cell to check
* @returns {boolean} The gender type
* @todo Handle trainers whose gender type has changed across different gens (Artists, etc.)
*/
function getGender(genderCell) {
switch (genderCell?.textContent) {
case "Female Only":
return false;
case "Male Only":
case "Both":
// Trim and lower-case the cell text before matching; a missing text value becomes ""
const gender = genderCell.textContent?.trim().toLowerCase() ?? "";
switch (gender) {
case "female only":
return true;
case "male only":
case "both":
case undefined:
default:
return true;
return false;
}
}

View File

@ -1,42 +1,28 @@
import chalk from "chalk";
import { JSDOM } from "jsdom";
import { checkGenderAndType } from "./check-gender.js";
/**
* @import { nameRecord, parsedNames } from "./types.js";
* @import { parsedNames } from "./types.js";
*/
/**
* Fetch a given trainer's names from the given URL.
* @param {string} url - The URL to parse
* @param {boolean} [currGender] - The current class' known gender.
* If provided, will override the natural gender detection with the given gender and avoid
* checking any gender counterparts.
* @returns {Promise<parsedNames>} A Promise that resolves with the parsed names once the parsing concludes.
* Will resolve with an empty array if the name could not be parsed.
* An error code for a bad URL.
*/
export async function fetchNames(url, currGender) {
const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${url}_(Trainer_class)`)).window;
const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement;
export const INVALID_URL = "bad_url_code";
/** @type {const} */
/**
* Fetch a given trainer's names from the given HTML document.
* @param {HTMLElement | null | undefined} trainerListHeader - The header containing the trainer lists
* @param {boolean} [knownFemale=false] - Whether the class is known to be female; default `false`
* @returns {parsedNames | INVALID_URL}
* An object containing the parsed names. \
* Will instead return with {@linkcode INVALID_URL} if the data is invalid.
*/
export function fetchNames(trainerListHeader, knownFemale = false) {
const trainerNames = /** @type {Set<string>} */ (new Set());
const femaleTrainerNames = /** @type {Set<string>} */ (new Set());
if (!trainerListHeader?.parentElement?.childNodes) {
console.warn(chalk.hex("#ffa500")(`URL ${url} did not correspond to a valid trainer class!`));
return { male: [], female: [] };
}
let trainerNames = /** @type {Set<string>} */ (new Set());
let femaleTrainerNames = /** @type {Set<string>} */ (new Set());
// If we don't know whether this class is female, check, optionally recursing into the counterpart's webpage as well.
if (currGender === undefined) {
/** @type {string | undefined} */
let counterpartURL;
[currGender, counterpartURL] = checkGenderAndType(document);
if (counterpartURL) {
console.log(chalk.green(`Accessing gender counterpart URL: ${counterpartURL}`));
const names = await fetchNames(counterpartURL, !currGender);
trainerNames = new Set(names.male);
femaleTrainerNames = new Set(names.female);
}
// Return early if no child nodes (ie tables) can be found
return INVALID_URL;
}
const elements = [...trainerListHeader.parentElement.childNodes];
@ -55,7 +41,7 @@ export async function fetchNames(url, currGender) {
),
);
parseTable(tables, currGender, trainerNames, femaleTrainerNames);
parseTable(tables, knownFemale, trainerNames, femaleTrainerNames);
return {
male: Array.from(trainerNames),
female: Array.from(femaleTrainerNames),

View File

@ -0,0 +1,16 @@
import chalk from "chalk";
/**
 * Print the usage/help text of the `scrape-trainers` CLI to stdout.
 * The text is emitted with a single `console.log` call and is wrapped in one leading
 * and one trailing newline.
 */
export function showHelpText() {
  const option = chalk.blue;
  const helpLines = [
    "",
    `Usage: ${chalk.cyan("pnpm scrape-trainers [options] <names>")}`,
    `Note that all option names are ${chalk.bold("case insensitive")}.`,
    `${chalk.hex("#8a2be2")("Arguments:")}`,
    `${chalk.hex("#7fff00")("names")} The name of one or more trainer classes to parse.`,
    `${chalk.hex("#ffa500")("Options:")}`,
    `${option("-h, --help")} Show this help message.`,
    `${option("-o, --out, --outfile")} The path to a file to save the output. If not provided, will send directly to stdout.`,
    "",
  ];
  console.log(helpLines.join("\n"));
}

View File

@ -1,36 +1,264 @@
import { toCamelCase, toPascalSnakeCase } from "../helpers/strings.js";
import { fetchNames } from "./fetch-names.js";
import { existsSync, writeFileSync } from "node:fs";
import { format } from "node:util";
import chalk from "chalk";
import inquirer from "inquirer";
import { JSDOM } from "jsdom";
import { toCamelCase, toPascalSnakeCase, toTitleCase } from "../helpers/strings.js";
import { checkGenderAndType } from "./check-gender.js";
import { fetchNames, INVALID_URL } from "./fetch-names.js";
import { showHelpText } from "./help-message.js";
/**
* @packageDocumentation
* This script will scrape Bulbapedia for the English names of a given trainer class,
* outputting them as JSON.
* Usage:
* Usage: `pnpm scrape-trainers`
*/
/**
* Scrape the requested trainer names and format the resultant output.
* @param {...string} classes The names of the trainer classes to retrieve
* @returns {Promise<string>} A Promise that resolves with the finished text.
* @import { parsedNames } from "./types.js"
*/
async function scrapeTrainerNames(...classes) {
/**
* A large object mapping each class to their corresponding list of trainer names. \
* Trainer classes with only 1 gender will only contain the single array for that gender.
* @type {Record<string, string[] | parsedNames>}
*/
const nameTuples = Object.fromEntries(
await Promise.all(
classes.map(async trainerClass => {
// Bulba URLs use Pascal_Snake_Case (Bug_Catcher)
const classURL = toPascalSnakeCase(trainerClass);
const names = await fetchNames(classURL);
const namesObj = names.female.length === 0 ? names.male : names;
return [toCamelCase(trainerClass), namesObj];
}),
),
);
return JSON.stringify(nameTuples, null, 2);
/** Version string printed in the startup banner. */
const version = "1.0.0";
/**
 * Flags that introduce the outfile path on the command line.
 * `--out` is listed so this agrees with the `-o, --out, --outfile` line of the help text.
 */
const SUPPORTED_ARGS = /** @type {const} */ (["-o", "--out", "--outfile", "--outFile"]);
/**
 * A large object mapping each "base" trainer name to a list of replacements.
 * Used to allow for trainer classes with different `TrainerType`s than in mainline.
 * Each scraped key found here is replaced by one output entry per listed name.
 * @type {Record<string, string[]>}
 */
const trainerNamesMap = {
  pokemonBreeder: ["breeder"],
  worker: ["worker", "snowWorker"],
  richBoy: ["richKid"],
  gentleman: ["rich"],
};
/**
 * CLI entry point: prints the banner, parses the command line, scrapes the requested
 * trainer classes and hands the result to {@linkcode tryWriteFile}.
 *
 * Sets `process.exitCode` to 1 (after printing the help text) when no trainer classes are given.
 * @returns {Promise<void>}
 */
async function main() {
  console.log(chalk.hex("#FF7F50")(`🍳 Trainer Name Scraper v${version}`));

  const args = process.argv.slice(2);

  // The help text advertises `-h, --help` (case insensitive); honour it before treating
  // the arguments as trainer class names.
  if (args.some(arg => ["-h", "--help"].includes(arg.toLowerCase()))) {
    showHelpText();
    return;
  }

  // NB: this removes the outfile flag (and its value) from `args`
  const out = getOutfile(args);

  // Break out if no args remain
  if (args.length === 0) {
    console.error(
      chalk.red.bold(
        `✗ Error: No trainer classes provided!\nArgs: ${chalk.hex("#7310fdff")(process.argv.slice(2).join(", "))}`,
      ),
    );
    showHelpText();
    process.exitCode = 1;
    return;
  }

  const output = await scrapeTrainerNames(args);
  await tryWriteFile(out, output);
}
console.log(await scrapeTrainerNames("doctor"));
/**
 * Get the outfile location from the args array.
 *
 * The outfile flag must be the first argument and may be written either as
 * `flag=path` or as `flag path`. Flag names are compared case-insensitively
 * against `SUPPORTED_ARGS`.
 * @param {string[]} args - The command line arguments
 * @returns {string | undefined} The outfile location, or `undefined` if none is provided
 * @remarks
 * This will mutate the `args` array by removing the outfile flag (and its value) from the list of arguments.
 */
function getOutfile(args) {
  /**
   * Case-insensitive check of a flag name against the supported outfile flags.
   * @param {string} flag
   */
  const isOutfileFlag = flag => SUPPORTED_ARGS.some(known => known.toLowerCase() === flag.toLowerCase());

  const firstArg = args[0] ?? "";
  /** @type {string | undefined} */
  let outFile;

  // Extract the argument as either the form "x=y" or "x y".
  // `exec` on a non-global regex exposes the capture groups (a `/g` `match` would not),
  // and `[^=]*` splits on the FIRST "=" so paths containing "=" survive intact.
  const equalsForm = /^([^=]*)=(.*)$/.exec(firstArg);
  if (equalsForm && isOutfileFlag(equalsForm[1])) {
    outFile = equalsForm[2];
    args.splice(0, 1);
  } else if (isOutfileFlag(firstArg)) {
    outFile = args[1];
    args.splice(0, 2);
  }

  // Covers both "no flag at all" and "flag given without a value"
  if (!outFile) {
    console.log(chalk.hex("#ffa500")("No outfile detected, logging to stdout..."));
    return undefined;
  }

  console.log(chalk.hex("#ffa500")(`Using outfile: ${chalk.blue(outFile)}`));
  return outFile;
}
/**
 * Scrape every requested trainer class and serialise the combined result as JSON.
 *
 * Duplicate class names are fetched once. Keys present in `trainerNamesMap` are replaced
 * by one entry per mapped name, the entries are sorted by key, and every array in the
 * output is written as an object with 1-based numeric keys.
 * @param {string[]} classes The names of the trainer classes to retrieve
 * @returns {Promise<string>} A Promise that resolves with the finished text.
 */
async function scrapeTrainerNames(classes) {
  const uniqueClasses = Array.from(new Set(classes));

  /**
   * Every class URL visited so far, shared between all concurrent fetches.
   * @type {Set<string>}
   */
  const seenClasses = new Set();

  /**
   * One `[key, names]` tuple per requested class. \
   * Classes without any female names carry only the plain array of male names.
   * @type {[keyName: string, names: string[] | parsedNames][]}
   */
  const fetchedTuples = await Promise.all(
    uniqueClasses.map(async trainerClass => {
      const [trainerName, names] = await doFetch(trainerClass, seenClasses);
      if (names.female.length === 0) {
        return /** @type {const} */ ([trainerName, names.male]);
      }
      return /** @type {const} */ ([trainerName, names]);
    }),
  );

  // Swap every key found in the replacement map for its mapped names (each sharing the same name list).
  const namesTuples = fetchedTuples.flatMap(tuple => {
    const [keyName, names] = tuple;
    if (!(keyName in trainerNamesMap)) {
      return [tuple];
    }
    return trainerNamesMap[keyName].map(
      alias => /** @type {[keyName: string, names: parsedNames | string[]]} */ ([alias, names]),
    );
  });

  namesTuples.sort((left, right) => left[0].localeCompare(right[0]));

  /** @type {Record<string, string[] | parsedNames>} */
  const namesRecord = Object.fromEntries(namesTuples);

  /**
   * JSON replacer turning each array into an object keyed by 1-based position.
   * @param {string} _key
   * @param {unknown} value
   */
  const indexArrays = (_key, value) => {
    if (!Array.isArray(value)) {
      return value;
    }
    const indexed = {};
    value.forEach((entry, position) => {
      indexed[position + 1] = entry; // 1 indexed
    });
    return indexed;
  };

  return JSON.stringify(namesRecord, indexArrays, 2);
}
/**
 * Recursively scrape names from a given Trainer class and its gender counterparts.
 * @param {string} trainerClass - The trainer class to fetch (converted to a Bulbapedia URL segment internally)
 * @param {Set<string>} seenClasses - A Set containing all seen class URLs, used for record keeping.
 * Mutated: the URL of every class visited is added to it.
 * @returns {Promise<[string, parsedNames]>}
 * A Promise that resolves with:
 * 1. The name to use for the key.
 * 2. All fetched names for this trainer class and its gender variants.
 * @throws {Error} If the page does not contain a usable trainer list.
 */
async function doFetch(trainerClass, seenClasses) {
  let keyName = toCamelCase(trainerClass);
  const classURL = toPascalSnakeCase(trainerClass);
  // Registered before the first `await` so that concurrent and recursive calls skip this page
  seenClasses.add(classURL);

  const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${classURL}_(Trainer_class)`))
    .window;
  const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement;
  const [female, counterpartURLs] = checkGenderAndType(document);
  const names = fetchNames(trainerListHeader, female);
  if (names === INVALID_URL) {
    // Reject with a real Error (not a bare string) so callers get a stack trace
    throw new Error(chalk.red.bold(`URL ${classURL} did not correspond to a valid trainer class!`));
  }

  // Recurse into all unseen gender counterparts' URLs, using the first male name we find.
  // The list is de-duplicated first: a page linking the same counterpart twice would otherwise
  // pass the `seenClasses` filter twice, since the recursive calls only register it afterwards.
  const unseenCounterparts = [...new Set(counterpartURLs)].filter(url => !seenClasses.has(url));
  const counterpartNames = await Promise.all(
    unseenCounterparts.map(counterpartURL => {
      console.log(chalk.green(`Accessing gender counterpart URL: ${toTitleCase(counterpartURL)}`));
      return doFetch(counterpartURL, seenClasses);
    }),
  );

  let overrodeName = false;
  for (const [cKeyName, cNameObj] of counterpartNames) {
    // A female class borrows the key of its first counterpart
    if (!overrodeName && female) {
      overrodeName = true;
      console.log(chalk.green(`Using "${cKeyName}" as the name of the JSON key object...`));
      keyName = cKeyName;
    }
    // Merge the counterpart's names into ours, dropping duplicates
    names.male = [...new Set([...names.male, ...cNameObj.male])];
    names.female = [...new Set([...names.female, ...cNameObj.female])];
  }
  return [normalizeDiacritics(keyName), names];
}
/**
 * Strip diacritical marks from a string, leaving only the base letters.
 * @param {string} str - The string to parse
 * @returns {string} The string with normalized diacritics
 */
function normalizeDiacritics(str) {
  // NFKD decomposition turns each accented letter into "base letter + combining mark" (à -> a + `);
  // the marks all live in the U+0300..U+036F range, so they can be dropped one by one.
  const FIRST_COMBINING_MARK = 0x0300;
  const LAST_COMBINING_MARK = 0x036f;

  let stripped = "";
  for (const char of str.normalize("NFKD")) {
    const codePoint = char.codePointAt(0);
    const isCombiningMark = codePoint >= FIRST_COMBINING_MARK && codePoint <= LAST_COMBINING_MARK;
    if (!isCombiningMark) {
      stripped += char;
    }
  }
  return stripped;
}
/**
 * Deliver the scraped output: print it to stdout when no outfile is given, otherwise
 * write it to the file, asking for confirmation first if the file already exists.
 *
 * Sets `process.exitCode` to 1 when the overwrite is declined or the write fails.
 * @param {string | undefined} outFile - The outfile
 * @param {string} output - The scraped output to produce
 */
async function tryWriteFile(outFile, output) {
  if (!outFile) {
    console.log(output);
    return;
  }

  if (existsSync(outFile)) {
    const mayOverwrite = await promptExisting(outFile);
    if (!mayOverwrite) {
      process.exitCode = 1;
      return;
    }
  }

  try {
    writeFileSync(outFile, output);
    console.log(chalk.green.bold(`✔ Output written to ${chalk.blue(outFile)} successfully!`));
  } catch (e) {
    /** @type {string} */
    let message;
    if (e instanceof Error) {
      // @ts-expect-error - Node.JS file errors always have codes
      const errorCode = e.code;
      if (errorCode === "ENOENT") {
        message = `File not found: ${outFile}`;
      } else if (errorCode === "EACCES") {
        message = `Could not write ${outFile}: Permission denied`;
      } else if (errorCode === "EISDIR") {
        message = `Unable to write to ${outFile} as it is a directory`;
      } else {
        message = `Error writing file: ${e.message}`;
      }
    } else {
      // Something other than an Error was thrown; print it verbatim
      message = format("Unknown error occurred: ", e);
    }
    console.error(chalk.red.bold(message));
    process.exitCode = 1;
  }
}
/**
 * Ask the user whether an already-existing outfile may be replaced.
 * The confirmation defaults to "No".
 * @param {string} outFile - The outfile
 * @returns {Promise<boolean>} Whether "Yes" or "No" was selected.
 */
async function promptExisting(outFile) {
  const overwriteQuestion = {
    type: "confirm",
    name: "continue",
    message: `File ${chalk.blue(outFile)} already exists!` + "\nDo you want to replace it?",
    default: false,
  };
  const answers = await inquirer.prompt([overwriteQuestion]);
  return answers.continue;
}
// Don't leave the entry-point promise floating: report any failure and flag it in the exit code
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});

View File

@ -5,3 +5,5 @@
* @property {string[]} male
* @property {string[]} female
*/
export {};