
Implement headless browser based scraping #169


Open · wants to merge 5 commits into main
7 changes: 7 additions & 0 deletions Dockerfile
@@ -1,4 +1,11 @@
# Note: include stuff from here if chromium breaks: https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#running-puppeteer-in-docker

FROM oven/bun:latest
# installs appropriate chromium binary for current architecture (x86 vs. ARM) https://github.com/cline/cline/pull/1721
RUN apt update && apt install chromium -y

ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true

WORKDIR /runtime
COPY . /runtime
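For context, here is a minimal sketch of how the environment variables above feed into Puppeteer. Puppeteer also picks up `PUPPETEER_EXECUTABLE_PATH` on its own; passing it explicitly here is only to make the dependency visible, and the flags mirror the launch call added in `src/utils/requestUtils.ts`:

```ts
// Sketch only: with PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true, Puppeteer never downloads
// its own browser, so the launch must resolve to the Chromium installed via apt.
import puppeteer from "puppeteer";

const browser = await puppeteer.launch({
  executablePath: process.env.PUPPETEER_EXECUTABLE_PATH, // /usr/bin/chromium in this image
  args: ["--no-sandbox"], // same flag used by the Scraper class in this PR
});
const page = await browser.newPage();
await browser.close();
```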
12 changes: 10 additions & 2 deletions README.md
@@ -27,15 +27,23 @@ Now install the API's dependencies by 'cd'-ing into the root of the repository a
bun install
```

Then, you can run the server with `bun start` and it should work! You can also use
`bun run start` since `bun start` is its shorthand version.
Then, you can run the server with `bun dev` and it should work! You can also use
`bun run dev`; `bun dev` is simply its shorthand.

Note: To add new dependencies, use `bun add dependency-name`. To remove dependencies, use `bun remove dependency-name`. Run `bun outdated` to see what dependencies are outdated and `bun update` to update all outdated dependencies to the latest version.

## Testing the Dockerfile

Build: `docker build -f Dockerfile . -t dining`
Run the server: `docker run -p 127.0.0.1:5010:5010 dining`
Run bash inside it (for debugging): `docker run --rm -it --entrypoint bash -p 127.0.0.1:5010:5010 dining`

## Under the hood

We get the entire list of locations from `DINING_URL`, fetch location specifics under their corresponding `CONCEPT_BASE_LINK`, and retrieve soups and specials from `DINING_SOUPS_URL` and `DINING_SPECIALS_URL`, respectively. See the `process()` method in `diningParser.ts` for more details.

Instead of using a naive `fetch`, we use Puppeteer to load each page in a real headless browser and wait a generous amount of time before scraping its contents.
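As a rough end-to-end sketch (these calls mirror the `reload()` changes in `src/server.ts` later in this diff):

```ts
// Sketch of the flow described above; see src/server.ts for the real reload().
import Scraper from "./utils/requestUtils";
import DiningParser from "./parser/diningParser";

const scraper = new Scraper();
await scraper.initialize();               // launch headless Chromium once
const parser = new DiningParser(scraper); // the parser reuses the same browser page
const locations = await parser.process(); // main page → per-concept pages → soups/specials
await scraper.close();                    // shut the browser down when done
```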

## Before submitting a PR

- Make sure all tests pass with `bun run test` or `bun run test --watch` for watch mode. (NOTE! `bun test` does something different and does NOT work!)
9,344 changes: 7,732 additions & 1,612 deletions bun.lock

Large diffs are not rendered by default.

Binary file added bun.lockb
Binary file not shown.
3 changes: 2 additions & 1 deletion package.json
@@ -5,7 +5,8 @@
"main": "server.ts",
"scripts": {
"test": "jest --coverage",
"start": "bun --hot run src/server.ts",
"dev": "DEV=true bun --hot run src/server.ts",
"start": "bun run src/server.ts",
"build": "bun build ./src/index.ts --outdir dist"
},
"repository": {
9 changes: 5 additions & 4 deletions src/containers/locationBuilder.ts
@@ -1,7 +1,6 @@
import Scraper from "utils/requestUtils";
import { load } from "cheerio";
import type { Element } from "domhandler";

import { getHTMLResponse } from "utils/requestUtils";
import { LocationOverwrites } from "overwrites/locationOverwrites";
import { getTimeRangesFromString } from "./timeBuilder";
import { ICoordinate, ILocation, ISpecial, ITimeRange } from "../types";
@@ -26,15 +25,17 @@ export default class LocationBuilder {
private times?: ITimeRange[];
private specials?: ISpecial[];
private soups?: ISpecial[];
private scraper: Scraper;

constructor(card: Element) {
constructor(card: Element, scraper: Scraper) {
const link = load(card)("h3.name.detailsLink");
this.name = link.text().trim();

const conceptId = link.attr("onclick")?.match(/Concept\/(\d+)/)?.[1];
this.conceptId = conceptId !== undefined ? parseInt(conceptId) : undefined;

this.shortDescription = load(card)("div.description").text().trim();
this.scraper = scraper;
}
overwriteLocation(locationOverwrites: LocationOverwrites) {
if (
@@ -70,7 +71,7 @@
const conceptURL = this.getConceptLink();
if (!conceptURL) return;

const $ = load(await getHTMLResponse(conceptURL));
const $ = load(await this.scraper.getHTML(conceptURL));
this.url = conceptURL.toString();
this.description = $("div.description p").text().trim();
this.menu = $("div.navItems > a#getMenu").attr("href");
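One benefit of injecting the `Scraper` into `LocationBuilder` is testability: a stub can stand in for the real browser. The sketch below is hypothetical (the card markup, stub, and import paths are illustrative, not taken from the repo's tests):

```ts
// Hypothetical test sketch: stub Scraper so no Chromium is launched.
import { load } from "cheerio";
import LocationBuilder from "../containers/locationBuilder";
import type Scraper from "../utils/requestUtils";

const fakeScraper = {
  getHTML: async (_url: URL) =>
    "<div class='description'><p>Stubbed concept page</p></div>",
} as unknown as Scraper;

// Minimal card matching the selectors used in the constructor.
const card = load(
  `<div class="card">
     <h3 class="name detailsLink" onclick="loadConcept('Concept/110')">Test Cafe</h3>
     <div class="description">Coffee and pastries</div>
   </div>`
)("div.card")[0];

const builder = new LocationBuilder(card, fakeScraper);
```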
25 changes: 14 additions & 11 deletions src/parser/diningParser.ts
@@ -1,4 +1,4 @@
import { getHTMLResponse } from "../utils/requestUtils";
import Scraper from "../utils/requestUtils";
import { load } from "cheerio";
import LocationBuilder from "../containers/locationBuilder";
import { retrieveSpecials } from "../containers/specials/specialsBuilder";
@@ -16,8 +16,11 @@ export default class DiningParser {
"https://apps.studentaffairs.cmu.edu/dining/conceptinfo/Specials";
static readonly DINING_SOUPS_URL =
"https://apps.studentaffairs.cmu.edu/dining/conceptinfo/Soups";
private scraper: Scraper;

constructor() {}
constructor(scraper: Scraper) {
this.scraper = scraper;
}

async process(): Promise<ILocation[]> {
const locationBuilders =
@@ -38,29 +41,29 @@
private async initializeLocationBuildersFromMainPage(): Promise<
LocationBuilder[]
> {
const mainPageHTML = await getHTMLResponse(
const mainPageHTML = await this.scraper.getHTML(
new URL(DiningParser.DINING_URL)
);
const mainContainer = load(mainPageHTML)("div.conceptCards");
if (mainContainer === undefined) {
throw new Error("Unable to load page");
}
const linkHeaders = mainContainer.find("div.card");
if (linkHeaders === undefined) {
return [];

if (linkHeaders.length === 0) {
throw new Error("Unable to load page");
}
return Array.from(linkHeaders).map((card) => new LocationBuilder(card));
return Array.from(linkHeaders).map(
(card) => new LocationBuilder(card, this.scraper)
);
}

private async fetchSpecials(): Promise<
[Record<number, ISpecial[]>, Record<number, ISpecial[]>]
> {
return await Promise.all([
retrieveSpecials(
await getHTMLResponse(new URL(DiningParser.DINING_SPECIALS_URL))
await this.scraper.getHTML(new URL(DiningParser.DINING_SPECIALS_URL))
),
retrieveSpecials(
await getHTMLResponse(new URL(DiningParser.DINING_SOUPS_URL))
await this.scraper.getHTML(new URL(DiningParser.DINING_SOUPS_URL))
),
]);
}
6 changes: 5 additions & 1 deletion src/server.ts
@@ -1,6 +1,7 @@
import { Elysia } from "elysia";
import { cors } from "@elysiajs/cors";
import DiningParser from "./parser/diningParser";
import Scraper from "./utils/requestUtils";
import { ILocation } from "types";

const PORT = process.env.PORT ?? 5010;
@@ -9,8 +10,11 @@ let cachedLocations: ILocation[];
async function reload(): Promise<void> {
const now = new Date();
console.log(`Reloading Dining API: ${now}`);
const parser = new DiningParser();
const scraper = new Scraper();
await scraper.initialize();
const parser = new DiningParser(scraper);
const locations = await parser.process();
await scraper.close();
if (
cachedLocations !== undefined &&
locations.length < cachedLocations.length - 1
62 changes: 45 additions & 17 deletions src/utils/requestUtils.ts
@@ -1,29 +1,57 @@
import axios from "axios";
import puppeteer, { Browser, Page } from "puppeteer";
import { AXIOS_RETRY_INTERVAL_MS, IS_TESTING } from "../config";

const wait = (ms: number) => {
return new Promise((re) => setTimeout(re, ms));
};

export async function getHTMLResponse(
url: URL,
retriesLeft = 4
): Promise<string> {
try {
if (!IS_TESTING) console.log(`Scraping ${url}`);
const response = await axios.get(url.toString());
if (!IS_TESTING)
export default class Scraper {
private browser?: Browser;
private page?: Page;
private initialized: Boolean = false;

async initialize() {
this.browser = await puppeteer.launch({ args: ["--no-sandbox"] });
this.page = await this.browser.newPage();
this.initialized = true;
}

async isInitialized(): Promise<Boolean> {
return this.initialized;
}

async getHTML(url: URL, retriesLeft = 4): Promise<string> {
if (!this.initialized) {
throw new Error("Scraper not initialized");
}
try {
console.log(`Scraping ${url}`);
await this.page!.goto(url.toString());
if (IS_TESTING || process.env.DEV) {
await wait(1000);
} else {
await wait(10000);
}
const response = await this.page!.content();
console.log({
message: `Scraped ${url}`,
html: response.data,
html: response,
url: url.toString(),
});
return response.data;
} catch (err) {
if (!IS_TESTING) console.error(err);
if (retriesLeft > 0) {
await wait(AXIOS_RETRY_INTERVAL_MS);
return await getHTMLResponse(url, retriesLeft - 1);
} else throw err;
return response;
} catch (err) {
if (!IS_TESTING) console.error(err);
if (retriesLeft > 0) {
await wait(AXIOS_RETRY_INTERVAL_MS);
return await this.getHTML(url, retriesLeft - 1);
} else throw err;
}
}

async close() {
if (!this.initialized) {
throw new Error("Scraper not initialized");
}
await this.browser!.close();
}
}
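Typical usage of the new class looks like the sketch below; the URL comes from `diningParser.ts`, and the `try/finally` is a suggestion rather than what `server.ts` currently does:

```ts
// Usage sketch for the Scraper class above.
import Scraper from "./utils/requestUtils";

const scraper = new Scraper();
await scraper.initialize(); // must run before getHTML(), which otherwise throws
try {
  const html = await scraper.getHTML(
    new URL("https://apps.studentaffairs.cmu.edu/dining/conceptinfo/Specials")
  );
  console.log(html.length);
} finally {
  await scraper.close(); // release Chromium even if every retry fails
}
```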