mirror of
https://github.com/pagefaultgames/pokerogue.git
synced 2025-08-19 22:09:27 +02:00
Made script actually work
This commit is contained in:
parent
43e9d82b26
commit
78efc5d130
@ -1,45 +1,53 @@
|
||||
/**
 * Check if the given trainer class is female, and collect links to its gender counterparts.
 * @param {Document} document - The HTML document to scrape
 * @returns {[gender: boolean, counterpartURLs: string[]]} A 2-length tuple containing:
 * 1. The trainer class' gender (`true` if female, `false` otherwise or if it cannot be determined)
 * 2. A list of all the current class' opposite-gender counterparts (empty if the trainer has none).
 */
export function checkGenderAndType(document) {
  const infoBox = document.getElementsByClassName("infobox")[0];
  if (!infoBox) {
    return [false, []];
  }

  // Find the row of the table containing the specified gender
  const rows = [...infoBox.getElementsByTagName("tr")];
  const genderRow = rows.find(row => [...row.childNodes].some(c => c.textContent?.includes("Gender")));
  const genderCell = genderRow?.querySelector("td");
  if (!genderCell) {
    return [false, []];
  }

  const gender = getGender(genderCell);

  // Check the "Counterpart" row's inner HTML for any `href`s to gender counterparts so they can be scraped too.
  // The capture group is limited to non-quote characters so a match can never run past the end of
  // one `href` attribute and swallow the markup between two links.
  const hrefExtractRegex = /href="\/wiki\/([^"]*?)_\(Trainer_class\)"/g;
  const counterpartRow = rows.find(row => [...row.childNodes].some(c => c.textContent?.includes("Counterpart")));
  const counterpartURLs = Array.from(counterpartRow?.innerHTML?.matchAll(hrefExtractRegex) ?? [], match => match[1]);

  return [gender, counterpartURLs];
}
|
||||
|
||||
/**
 * Retrieve the gender from the given node text.
 * @param {HTMLTableCellElement} genderCell - The cell to check
 * @returns {boolean} `true` if the cell's text is "Female Only" (ignoring case and surrounding whitespace);
 * `false` for anything else ("Male Only", "Both", empty or unrecognised text).
 * @todo Handle trainers whose gender type has changed across different gens (Artists, etc.)
 */
function getGender(genderCell) {
  // `textContent` may be `null`; treat that the same as an empty cell.
  const gender = genderCell.textContent?.trim().toLowerCase() ?? "";
  return gender === "female only";
}
|
||||
|
@ -1,42 +1,28 @@
|
||||
import chalk from "chalk";
|
||||
import { JSDOM } from "jsdom";
|
||||
import { checkGenderAndType } from "./check-gender.js";
|
||||
|
||||
/**
|
||||
* @import { nameRecord, parsedNames } from "./types.js";
|
||||
* @import { parsedNames } from "./types.js";
|
||||
*/
|
||||
|
||||
/**
|
||||
* Fetch a given trainer's names from the given URL.
|
||||
* @param {string} url - The URL to parse
|
||||
* @param {boolean} [currGender] - The current class' known gender.
|
||||
* If provided, will override the natural gender detection with the given gender and avoid
|
||||
* checking any gender counterparts.
|
||||
* @returns {Promise<parsedNames>} A Promise that resolves with the parsed names once the parsing concludes.
|
||||
* Will resolve with an empty array if the name could not be parsed.
|
||||
* An error code for a bad URL.
|
||||
*/
|
||||
export async function fetchNames(url, currGender) {
|
||||
const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${url}_(Trainer_class)`)).window;
|
||||
const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement;
|
||||
export const INVALID_URL = "bad_url_code";
|
||||
|
||||
/** @type {const} */
|
||||
|
||||
/**
|
||||
* Fetch a given trainer's names from the given HTML document.
|
||||
* @param {HTMLElement | null | undefined} trainerListHeader - The header containing the trainer lists
|
||||
* @param {boolean} [knownFemale=false] - Whether the class is known to be female; default `false`
|
||||
* @returns {parsedNames | INVALID_URL}
|
||||
* An object containing the parsed names. \
|
||||
* Will instead return with {@linkcode INVALID_URL} if the data is invalid.
|
||||
*/
|
||||
export function fetchNames(trainerListHeader, knownFemale = false) {
|
||||
const trainerNames = /** @type {Set<string>} */ (new Set());
|
||||
const femaleTrainerNames = /** @type {Set<string>} */ (new Set());
|
||||
if (!trainerListHeader?.parentElement?.childNodes) {
|
||||
console.warn(chalk.hex("#ffa500")(`URL ${url} did not correspond to a valid trainer class!`));
|
||||
return { male: [], female: [] };
|
||||
}
|
||||
|
||||
let trainerNames = /** @type {Set<string>} */ (new Set());
|
||||
let femaleTrainerNames = /** @type {Set<string>} */ (new Set());
|
||||
|
||||
// If we don't know whether this class is female, check, optionally recursing into the counterpart's webpage as well.
|
||||
if (currGender === undefined) {
|
||||
/** @type {string | undefined} */
|
||||
let counterpartURL;
|
||||
[currGender, counterpartURL] = checkGenderAndType(document);
|
||||
if (counterpartURL) {
|
||||
console.log(chalk.green(`Accessing gender counterpart URL: ${counterpartURL}`));
|
||||
const names = await fetchNames(counterpartURL, !currGender);
|
||||
trainerNames = new Set(names.male);
|
||||
femaleTrainerNames = new Set(names.female);
|
||||
}
|
||||
// Return early if no child nodes (ie tables) can be found
|
||||
return INVALID_URL;
|
||||
}
|
||||
|
||||
const elements = [...trainerListHeader.parentElement.childNodes];
|
||||
@ -55,7 +41,7 @@ export async function fetchNames(url, currGender) {
|
||||
),
|
||||
);
|
||||
|
||||
parseTable(tables, currGender, trainerNames, femaleTrainerNames);
|
||||
parseTable(tables, knownFemale, trainerNames, femaleTrainerNames);
|
||||
return {
|
||||
male: Array.from(trainerNames),
|
||||
female: Array.from(femaleTrainerNames),
|
||||
|
16
scripts/scrape-trainer-names/help-message.js
Normal file
16
scripts/scrape-trainer-names/help-message.js
Normal file
@ -0,0 +1,16 @@
|
||||
import chalk from "chalk";
|
||||
|
||||
/**
 * Show help/usage text for the `scrape-trainers` CLI.
 * Prints a single colourised block to stdout; returns nothing.
 */
export function showHelpText() {
  console.log(`
Usage: ${chalk.cyan("pnpm scrape-trainers [options] <names>")}
Note that all option names are ${chalk.bold("case insensitive")}.

${chalk.hex("#8a2be2")("Arguments:")}
${chalk.hex("#7fff00")("names")} The name of one or more trainer classes to parse.

${chalk.hex("#ffa500")("Options:")}
${chalk.blue("-h, --help")} Show this help message.
${chalk.blue("-o, --out, --outfile")} The path to a file to save the output. If not provided, will send directly to stdout.
`);
}
|
@ -1,36 +1,264 @@
|
||||
import { toCamelCase, toPascalSnakeCase } from "../helpers/strings.js";
|
||||
import { fetchNames } from "./fetch-names.js";
|
||||
import { existsSync, writeFileSync } from "node:fs";
|
||||
import { format } from "node:util";
|
||||
import chalk from "chalk";
|
||||
import inquirer from "inquirer";
|
||||
import { JSDOM } from "jsdom";
|
||||
import { toCamelCase, toPascalSnakeCase, toTitleCase } from "../helpers/strings.js";
|
||||
import { checkGenderAndType } from "./check-gender.js";
|
||||
import { fetchNames, INVALID_URL } from "./fetch-names.js";
|
||||
import { showHelpText } from "./help-message.js";
|
||||
|
||||
/**
|
||||
* @packageDocumentation
|
||||
* This script will scrape Bulbapedia for the English names of a given trainer class,
|
||||
* outputting them as JSON.
|
||||
* Usage:
|
||||
* Usage: `pnpm scrape-trainers`
|
||||
*/
|
||||
|
||||
/**
|
||||
* Scrape the requested trainer names and format the resultant output.
|
||||
* @param {...string} classes The names of the trainer classes to retrieve
|
||||
* @returns {Promise<string>} A Promise that resolves with the finished text.
|
||||
* @import { parsedNames } from "./types.js"
|
||||
*/
|
||||
async function scrapeTrainerNames(...classes) {
|
||||
/**
|
||||
* A large object mapping each class to their corresponding list of trainer names. \
|
||||
* Trainer classes with only 1 gender will only contain the single array for that gender.
|
||||
* @type {Record<string, string[] | parsedNames>}
|
||||
*/
|
||||
const nameTuples = Object.fromEntries(
|
||||
await Promise.all(
|
||||
classes.map(async trainerClass => {
|
||||
// Bulba URLs use Pascal_Snake_Case (Bug_Catcher)
|
||||
const classURL = toPascalSnakeCase(trainerClass);
|
||||
const names = await fetchNames(classURL);
|
||||
const namesObj = names.female.length === 0 ? names.male : names;
|
||||
return [toCamelCase(trainerClass), namesObj];
|
||||
}),
|
||||
),
|
||||
);
|
||||
return JSON.stringify(nameTuples, null, 2);
|
||||
|
||||
/** Version string shown in the CLI banner. */
const version = "1.0.0";

/**
 * Flags accepted for specifying the output file.
 * Includes `--out`, which the help text documents alongside `-o` and `--outfile`.
 */
const SUPPORTED_ARGS = /** @type {const} */ (["-o", "--out", "--outfile", "--outFile"]);

/**
 * A large object mapping each "base" trainer name to a list of replacements.
 * Used to allow for trainer classes with different `TrainerType`s than in mainline.
 * @type {Record<string, string[]>}
 */
const trainerNamesMap = {
  pokemonBreeder: ["breeder"],
  worker: ["worker", "snowWorker"],
  richBoy: ["richKid"],
  gentleman: ["rich"],
};
|
||||
|
||||
/**
 * CLI entry point: parse the command line, scrape the requested trainer classes
 * and write the result to the outfile (or stdout).
 *
 * Sets `process.exitCode` to 1 when no trainer classes are given or when scraping fails.
 * @returns {Promise<void>}
 */
async function main() {
  console.log(chalk.hex("#FF7F50")(`🍳 Trainer Name Scraper v${version}`));

  const args = process.argv.slice(2);

  // The help text advertises `-h`/`--help` (case insensitive), so honour it before anything else.
  const wantsHelp = args.some(arg => {
    const lowered = arg.toLowerCase();
    return lowered === "-h" || lowered === "--help";
  });
  if (wantsHelp) {
    showHelpText();
    return;
  }

  // NB: `getOutfile` strips the outfile option from `args`, leaving only class names behind.
  const out = getOutfile(args);

  // Break out if no args remain
  if (args.length === 0) {
    console.error(
      chalk.red.bold(
        `✗ Error: No trainer classes provided!\nArgs: ${chalk.hex("#7310fdff")(process.argv.slice(2).join(", "))}`,
      ),
    );
    showHelpText();
    process.exitCode = 1;
    return;
  }

  let output;
  try {
    output = await scrapeTrainerNames(args);
  } catch (err) {
    // `main()` is invoked without a rejection handler, so report failures here
    // instead of letting them surface as an unhandled rejection.
    console.error(err instanceof Error ? chalk.red.bold(err.message) : err);
    process.exitCode = 1;
    return;
  }

  await tryWriteFile(out, output);
}
|
||||
|
||||
console.log(await scrapeTrainerNames("doctor"));
|
||||
/**
 * Get the outfile location from the args array.
 *
 * The option must be the first argument, in either `flag=path` or `flag path` form,
 * where `flag` is one of {@linkcode SUPPORTED_ARGS} (compared case-insensitively).
 * @param {string[]} args - The command line arguments
 * @returns {string | undefined} The outfile location, or `undefined` if none is provided
 * @remarks
 * This will mutate the `args` array by removing the outfile option (and its value) from the list of arguments.
 */
function getOutfile(args) {
  const first = args[0];
  const equalsIndex = first?.indexOf("=") ?? -1;
  const flag = (equalsIndex === -1 ? first : first?.slice(0, equalsIndex))?.toLowerCase();
  const isOutfileFlag = flag !== undefined && SUPPORTED_ARGS.some(supported => supported.toLowerCase() === flag);

  if (!isOutfileFlag) {
    console.log(chalk.hex("#ffa500")("No outfile detected, logging to stdout..."));
    return;
  }

  // Extract the argument as either the form "x=y" or "x y".
  /** @type {string | undefined} */
  let outFile;
  if (equalsIndex !== -1) {
    // Split manually: `String.prototype.match` with a `g` regex returns no capture groups.
    outFile = first.slice(equalsIndex + 1);
    args.splice(0, 1);
  } else {
    outFile = args[1];
    args.splice(0, 2);
  }

  if (!outFile) {
    console.log(chalk.hex("#ffa500")("Outfile option given without a path, logging to stdout..."));
    return;
  }

  console.log(chalk.hex("#ffa500")(`Using outfile: ${chalk.blue(outFile)}`));
  return outFile;
}
|
||||
|
||||
/**
 * Scrape the requested trainer names and format the resultant output.
 * @param {string[]} classes The names of the trainer classes to retrieve (duplicates are ignored)
 * @returns {Promise<string>} A Promise that resolves with the finished text: pretty-printed JSON
 * keyed by class name (sorted), in which every array is emitted as an object with 1-based numeric keys.
 * Rejects if any class fails to scrape.
 */
async function scrapeTrainerNames(classes) {
  const uniqueClasses = [...new Set(classes)];

  /**
   * A Set containing all trainer URLs that have been seen.
   * Shared between all fetches so gender counterparts are not scraped more than once.
   * @type {Set<string>}
   */
  const seenClasses = new Set();

  /**
   * A large array of tuples matching each class to their corresponding list of trainer names. \
   * Classes with no female names are collapsed into the bare array of male names.
   * @type {[keyName: string, names: string[] | parsedNames][]}
   */
  const namesTuples = await Promise.all(
    uniqueClasses.map(async trainerClass => {
      const [trainerName, names] = await doFetch(trainerClass, seenClasses);
      const namesObj = names.female.length === 0 ? names.male : names;
      return /** @type {[keyName: string, names: string[] | parsedNames]} */ ([trainerName, namesObj]);
    }),
  );

  // Swap every key found in the name replacement map for its replacement key(s), in place.
  // `Object.hasOwn` is used rather than `in` so inherited properties such as `constructor`
  // or `toString` are never mistaken for replacement entries.
  const renamedTuples = namesTuples.flatMap(([keyName, names]) =>
    Object.hasOwn(trainerNamesMap, keyName)
      ? trainerNamesMap[keyName].map(
          name => /** @type {[keyName: string, names: parsedNames | string[]]} */ ([name, names]),
        )
      : [/** @type {[keyName: string, names: parsedNames | string[]]} */ ([keyName, names])],
  );

  renamedTuples.sort((a, b) => a[0].localeCompare(b[0]));

  /** @type {Record<string, string[] | parsedNames>} */
  const namesRecord = Object.fromEntries(renamedTuples);

  // Convert all arrays into objects indexed by the number
  return JSON.stringify(
    namesRecord,
    (_, value) => {
      if (!Array.isArray(value)) {
        return value;
      }
      return Object.fromEntries(value.map((entry, i) => [i + 1, entry])); // 1 indexed
    },
    2,
  );
}
|
||||
|
||||
/**
 * Recursively scrape names from a given Trainer class and its gender counterparts.
 * @param {string} trainerClass - The name of the trainer class to scrape
 * @param {Set<string>} seenClasses - A Set containing all seen class URLs, used for record keeping.
 * The URL of this class is added to it.
 * @returns {Promise<[string, parsedNames]>}
 * A Promise that resolves with:
 * 1. The name to use for the key.
 * 2. All fetched names for this trainer class and its gender variants.
 *
 * Rejects with an `Error` if the page does not contain a parsable trainer list.
 */
async function doFetch(trainerClass, seenClasses) {
  let keyName = toCamelCase(trainerClass);
  const classURL = toPascalSnakeCase(trainerClass);
  // Recorded before the first `await`, so any call that starts later already sees this class as visited.
  seenClasses.add(classURL);

  const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${classURL}_(Trainer_class)`))
    .window;
  const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement;
  const [female, counterpartURLs] = checkGenderAndType(document);
  const names = fetchNames(trainerListHeader, female);
  if (names === INVALID_URL) {
    throw new Error(`URL ${classURL} did not correspond to a valid trainer class!`);
  }

  // Recurse into all unseen gender counterparts' URLs.
  // The list is de-duplicated first: the same counterpart may be linked more than once on a page,
  // and every entry passes the "unseen" filter before any of the recursive calls has marked it.
  const unseenCounterparts = [...new Set(counterpartURLs)].filter(url => !seenClasses.has(url));
  const counterpartNames = await Promise.all(
    unseenCounterparts.map(counterpartURL => {
      console.log(chalk.green(`Accessing gender counterpart URL: ${toTitleCase(counterpartURL)}`));
      return doFetch(counterpartURL, seenClasses);
    }),
  );

  let overrodeName = false;
  for (const [cKeyName, cNameObj] of counterpartNames) {
    // A female class is filed under the key of the first counterpart that was fetched.
    if (!overrodeName && female) {
      overrodeName = true;
      console.log(chalk.green(`Using "${cKeyName}" as the name of the JSON key object...`));
      keyName = cKeyName;
    }
    names.male = [...new Set([...names.male, ...cNameObj.male])];
    names.female = [...new Set([...names.female, ...cNameObj.female])];
  }

  return [normalizeDiacritics(keyName), names];
}
|
||||
|
||||
/**
 * Strip all diacritical marks from a string, leaving the base letters behind.
 * @param {string} str - The string to parse
 * @returns {string} The string in NFKD form with its combining diacritical marks removed
 */
function normalizeDiacritics(str) {
  // Normalizing to NFKD splits accented letters into the base letter + combining mark (à -> a + ◌̀);
  // the marks all live in the "Combining Diacritical Marks" block (U+0300-U+036F) for easy removal.
  return str.normalize("NFKD").replace(/[\u0300-\u036f]/g, "");
}
|
||||
|
||||
/**
 * Try to write the output to a file (or log it to stdout, as the case may be).
 *
 * If the file already exists, the user is asked for confirmation first; declining
 * sets `process.exitCode` to 1 and writes nothing. Write failures are reported on
 * stderr and also set `process.exitCode` to 1 rather than throwing.
 * @param {string | undefined} outFile - The outfile; if falsy, `output` is logged to stdout instead
 * @param {string} output - The scraped output to produce
 * @returns {Promise<void>}
 */
async function tryWriteFile(outFile, output) {
  if (!outFile) {
    console.log(output);
    return;
  }

  if (existsSync(outFile) && !(await promptExisting(outFile))) {
    process.exitCode = 1;
    return;
  }

  try {
    writeFileSync(outFile, output);
    console.log(chalk.green.bold(`✔ Output written to ${chalk.blue(outFile)} successfully!`));
  } catch (e) {
    console.error(chalk.red.bold(describeWriteError(e, outFile)));
    process.exitCode = 1;
  }
}

/**
 * Build a human-readable message for an error thrown while writing the outfile.
 * @param {unknown} e - The value that was thrown
 * @param {string} outFile - The path that was being written
 * @returns {string} The message to display
 */
function describeWriteError(e, outFile) {
  if (!(e instanceof Error)) {
    return format("Unknown error occurred: ", e);
  }

  // @ts-expect-error - Node.JS file errors always have codes
  switch (e.code) {
    case "ENOENT":
      // When creating a file, ENOENT means a directory in the path is missing, not the file itself.
      return `Could not write ${outFile}: parent directory does not exist`;
    case "EACCES":
      return `Could not write ${outFile}: Permission denied`;
    case "EISDIR":
      return `Unable to write to ${outFile} as it is a directory`;
    default:
      return `Error writing file: ${e.message}`;
  }
}
|
||||
|
||||
/**
 * Confirm overwriting an already-existing file.
 * Asks an interactive yes/no question that defaults to "No".
 * @param {string} outFile - The outfile
 * @returns {Promise<boolean>} Whether "Yes" or "No" was selected.
 */
async function promptExisting(outFile) {
  const answers = await inquirer.prompt([
    {
      type: "confirm",
      name: "continue",
      message: `File ${chalk.blue(outFile)} already exists!\nDo you want to replace it?`,
      default: false,
    },
  ]);
  return answers.continue;
}
|
||||
|
||||
main();
|
||||
|
@ -5,3 +5,5 @@
|
||||
* @property {string[]} male
|
||||
* @property {string[]} female
|
||||
*/
|
||||
|
||||
export {};
|
||||
|
Loading…
Reference in New Issue
Block a user