
Implement headless browser based scraping #169


Open · wants to merge 5 commits into main
7 changes: 7 additions & 0 deletions Dockerfile
@@ -1,4 +1,11 @@
# Note: include stuff from here if chromium breaks: https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#running-puppeteer-in-docker

FROM oven/bun:latest
# installs appropriate chromium binary for current architecture (x86 vs. ARM) https://github.com/cline/cline/pull/1721
RUN apt update && apt install chromium -y

ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true

WORKDIR /runtime
COPY . /runtime
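For context, here is a minimal sketch of how the environment variables above feed into Puppeteer. Puppeteer also picks up `PUPPETEER_EXECUTABLE_PATH` on its own; passing it explicitly here is only to make the dependency visible, and the flags mirror the launch call added in `src/utils/requestUtils.ts`:

```ts
// Sketch only: with PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true, Puppeteer never downloads
// its own browser, so the launch must resolve to the Chromium installed via apt.
import puppeteer from "puppeteer";

const browser = await puppeteer.launch({
  executablePath: process.env.PUPPETEER_EXECUTABLE_PATH, // /usr/bin/chromium in this image
  args: ["--no-sandbox"], // same flag used by the Scraper class in this PR
});
const page = await browser.newPage();
await browser.close();
```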
12 changes: 10 additions & 2 deletions README.md
@@ -27,15 +27,23 @@ Now install the API's dependencies by 'cd'-ing into the root of the repository a
bun install
```

Then, you can run the server with `bun start` and it should work! You can also use
`bun run start` since `bun start` is its shorthand version.
Then, you can run the server with `bun dev` and it should work! You can also use
`bun run dev`; `bun dev` is simply its shorthand.

Note: To add new dependencies, use `bun add dependency-name`. To remove dependencies, use `bun remove dependency-name`. Run `bun outdated` to see what dependencies are outdated and `bun update` to update all outdated dependencies to the latest version.

## Testing the Dockerfile

Build: `docker build -f Dockerfile . -t dining`
Run the server: `docker run -p 127.0.0.1:5010:5010 dining`
Run bash inside it (for debugging): `docker run --rm -it --entrypoint bash -p 127.0.0.1:5010:5010 dining`

## Under the hood

We get the entire list of locations from `DINING_URL`, fetch location specifics under their corresponding `CONCEPT_BASE_LINK`, and retrieve soups and specials from `DINING_SOUPS_URL` and `DINING_SPECIALS_URL`, respectively. See the `process()` method in `diningParser.ts` for more details.

Instead of using a naive `fetch`, we use Puppeteer to load each page in a real headless browser and wait a generous amount of time before scraping its contents.
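As a rough end-to-end sketch (these calls mirror the `reload()` changes in `src/server.ts` later in this diff):

```ts
// Sketch of the flow described above; see src/server.ts for the real reload().
import Scraper from "./utils/requestUtils";
import DiningParser from "./parser/diningParser";

const scraper = new Scraper();
await scraper.initialize();               // launch headless Chromium once
const parser = new DiningParser(scraper); // the parser reuses the same browser page
const locations = await parser.process(); // main page → per-concept pages → soups/specials
await scraper.close();                    // shut the browser down when done
```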

## Before submitting a PR

- Make sure all tests pass with `bun run test` or `bun run test --watch` for watch mode. (NOTE! `bun test` does something different and does NOT work!)
9,344 changes: 7,732 additions & 1,612 deletions bun.lock

Large diffs are not rendered by default.

Binary file added bun.lockb
Binary file not shown.
3 changes: 2 additions & 1 deletion package.json
@@ -5,7 +5,8 @@
"main": "server.ts",
"scripts": {
"test": "jest --coverage",
"start": "bun --hot run src/server.ts",
"dev": "DEV=true bun --hot run src/server.ts",
"start": "bun run src/server.ts",
"build": "bun build ./src/index.ts --outdir dist"
},
"repository": {
9 changes: 5 additions & 4 deletions src/containers/locationBuilder.ts
@@ -1,7 +1,6 @@
import Scraper from "utils/requestUtils";
import { load } from "cheerio";
import type { Element } from "domhandler";

import { getHTMLResponse } from "utils/requestUtils";
import { LocationOverwrites } from "overwrites/locationOverwrites";
import { getTimeRangesFromString } from "./timeBuilder";
import { ICoordinate, ILocation, ISpecial, ITimeRange } from "../types";
@@ -26,15 +25,17 @@ export default class LocationBuilder {
private times?: ITimeRange[];
private specials?: ISpecial[];
private soups?: ISpecial[];
private scraper: Scraper;

constructor(card: Element) {
constructor(card: Element, scraper: Scraper) {
const link = load(card)("h3.name.detailsLink");
this.name = link.text().trim();

const conceptId = link.attr("onclick")?.match(/Concept\/(\d+)/)?.[1];
this.conceptId = conceptId !== undefined ? parseInt(conceptId) : undefined;

this.shortDescription = load(card)("div.description").text().trim();
this.scraper = scraper;
}
overwriteLocation(locationOverwrites: LocationOverwrites) {
if (
@@ -70,7 +71,7 @@
const conceptURL = this.getConceptLink();
if (!conceptURL) return;

const $ = load(await getHTMLResponse(conceptURL));
const $ = load(await this.scraper.getHTML(conceptURL));
this.url = conceptURL.toString();
this.description = $("div.description p").text().trim();
this.menu = $("div.navItems > a#getMenu").attr("href");
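One benefit of injecting the `Scraper` into `LocationBuilder` is testability: a stub can stand in for the real browser. The sketch below is hypothetical (the card markup, stub, and import paths are illustrative, not taken from the repo's tests):

```ts
// Hypothetical test sketch: stub Scraper so no Chromium is launched.
import { load } from "cheerio";
import LocationBuilder from "../containers/locationBuilder";
import type Scraper from "../utils/requestUtils";

const fakeScraper = {
  getHTML: async (_url: URL) =>
    "<div class='description'><p>Stubbed concept page</p></div>",
} as unknown as Scraper;

// Minimal card matching the selectors used in the constructor.
const card = load(
  `<div class="card">
     <h3 class="name detailsLink" onclick="loadConcept('Concept/110')">Test Cafe</h3>
     <div class="description">Coffee and pastries</div>
   </div>`
)("div.card")[0];

const builder = new LocationBuilder(card, fakeScraper);
```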
25 changes: 14 additions & 11 deletions src/parser/diningParser.ts
@@ -1,4 +1,4 @@
import { getHTMLResponse } from "../utils/requestUtils";
import Scraper from "../utils/requestUtils";
import { load } from "cheerio";
import LocationBuilder from "../containers/locationBuilder";
import { retrieveSpecials } from "../containers/specials/specialsBuilder";
@@ -16,8 +16,11 @@ export default class DiningParser {
"https://apps.studentaffairs.cmu.edu/dining/conceptinfo/Specials";
static readonly DINING_SOUPS_URL =
"https://apps.studentaffairs.cmu.edu/dining/conceptinfo/Soups";
private scraper: Scraper;

constructor() {}
constructor(scraper: Scraper) {
this.scraper = scraper;
}

async process(): Promise<ILocation[]> {
const locationBuilders =
@@ -38,29 +41,29 @@
private async initializeLocationBuildersFromMainPage(): Promise<
LocationBuilder[]
> {
const mainPageHTML = await getHTMLResponse(
const mainPageHTML = await this.scraper.getHTML(
new URL(DiningParser.DINING_URL)
);
const mainContainer = load(mainPageHTML)("div.conceptCards");
if (mainContainer === undefined) {
throw new Error("Unable to load page");
}
const linkHeaders = mainContainer.find("div.card");
if (linkHeaders === undefined) {
return [];

if (linkHeaders.length === 0) {
throw new Error("Unable to load page");
}
return Array.from(linkHeaders).map((card) => new LocationBuilder(card));
return Array.from(linkHeaders).map(
(card) => new LocationBuilder(card, this.scraper)
);
}

private async fetchSpecials(): Promise<
[Record<number, ISpecial[]>, Record<number, ISpecial[]>]
> {
return await Promise.all([
retrieveSpecials(
await getHTMLResponse(new URL(DiningParser.DINING_SPECIALS_URL))
await this.scraper.getHTML(new URL(DiningParser.DINING_SPECIALS_URL))
),
retrieveSpecials(
await getHTMLResponse(new URL(DiningParser.DINING_SOUPS_URL))
await this.scraper.getHTML(new URL(DiningParser.DINING_SOUPS_URL))
),
]);
}
6 changes: 5 additions & 1 deletion src/server.ts
@@ -1,6 +1,7 @@
import { Elysia } from "elysia";
import { cors } from "@elysiajs/cors";
import DiningParser from "./parser/diningParser";
import Scraper from "./utils/requestUtils";
import { ILocation } from "types";

const PORT = process.env.PORT ?? 5010;
@@ -9,8 +10,11 @@ let cachedLocations: ILocation[];
async function reload(): Promise<void> {
const now = new Date();
console.log(`Reloading Dining API: ${now}`);
const parser = new DiningParser();
const scraper = new Scraper();
await scraper.initialize();
const parser = new DiningParser(scraper);
const locations = await parser.process();
await scraper.close();
if (
cachedLocations !== undefined &&
locations.length < cachedLocations.length - 1
62 changes: 45 additions & 17 deletions src/utils/requestUtils.ts
@@ -1,29 +1,57 @@
import axios from "axios";
import puppeteer, { Browser, Page } from "puppeteer";
import { AXIOS_RETRY_INTERVAL_MS, IS_TESTING } from "../config";

const wait = (ms: number) => {
return new Promise((re) => setTimeout(re, ms));
};

export async function getHTMLResponse(
url: URL,
retriesLeft = 4
): Promise<string> {
try {
if (!IS_TESTING) console.log(`Scraping ${url}`);
const response = await axios.get(url.toString());
if (!IS_TESTING)
export default class Scraper {
private browser?: Browser;
private page?: Page;
private initialized: Boolean = false;

async initialize() {
this.browser = await puppeteer.launch({ args: ["--no-sandbox"] });
this.page = await this.browser.newPage();
this.initialized = true;
}

async isInitialized(): Promise<Boolean> {
return this.initialized;
}

async getHTML(url: URL, retriesLeft = 4): Promise<string> {
if (!this.initialized) {
throw new Error("Scraper not initialized");
}
try {
console.log(`Scraping ${url}`);
await this.page!.goto(url.toString());
if (IS_TESTING || process.env.DEV) {
await wait(1000);
} else {
await wait(10000);
}
const response = await this.page!.content();
console.log({
message: `Scraped ${url}`,
html: response.data,
html: response,
url: url.toString(),
});
return response.data;
} catch (err) {
if (!IS_TESTING) console.error(err);
if (retriesLeft > 0) {
await wait(AXIOS_RETRY_INTERVAL_MS);
return await getHTMLResponse(url, retriesLeft - 1);
} else throw err;
return response;
} catch (err) {
if (!IS_TESTING) console.error(err);
if (retriesLeft > 0) {
await wait(AXIOS_RETRY_INTERVAL_MS);
return await this.getHTML(url, retriesLeft - 1);
} else throw err;
}
}

async close() {
if (!this.initialized) {
throw new Error("Scraper not initialized");
}
await this.browser!.close();
}
}
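Typical usage of the new class looks like the sketch below; the URL comes from `diningParser.ts`, and the `try/finally` is a suggestion rather than what `server.ts` currently does:

```ts
// Usage sketch for the Scraper class above.
import Scraper from "./utils/requestUtils";

const scraper = new Scraper();
await scraper.initialize(); // must run before getHTML(), which otherwise throws
try {
  const html = await scraper.getHTML(
    new URL("https://apps.studentaffairs.cmu.edu/dining/conceptinfo/Specials")
  );
  console.log(html.length);
} finally {
  await scraper.close(); // release Chromium even if every retry fails
}
```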