Skip to content

feat: improve extraction [wip] #169

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 66 additions & 28 deletions maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -250,51 +250,89 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
));
}

/**
 * Scrapes a list of similar items from the current document.
 *
 * Every element matching `listSelector` is treated as one list item and yields one
 * record built from `fields`. When a matched element is itself a `<table>`, each of
 * its data rows becomes a record instead, keyed by the header cell text (or
 * `ColumnN` when there is no header for that column); `fields` is not consulted
 * for tables.
 *
 * @param {Object} config - Configuration object
 * @param {string} config.listSelector - Selector for the list item(s) or table(s)
 * @param {Object.<string, {selector: string, attribute?: string}>} config.fields - Fields to scrape
 * @param {number} [config.limit=10] - Maximum number of records to return
 * @param {boolean} [config.includeHidden=true] - When false, a field whose matched
 *   element has a null `offsetParent` is left out of the record
 * @returns {Promise<Array.<Object>>} Flat array of scraped records, at most `limit` long
 */
window.scrapeList = async function ({ listSelector, fields, limit = 10, includeHidden = true }) {
  const scrapedData = [];

  // Reads the requested property/attribute from a matched field element.
  const readField = (element, attribute) => {
    switch (attribute) {
      case 'innerText':
        return element.innerText.trim();
      case 'innerHTML':
        return element.innerHTML.trim();
      case 'src':
        return element.src;
      case 'href':
        return element.href;
      default:
        return element.getAttribute(attribute);
    }
  };

  // A single pass over the matches is enough: nothing here changes the DOM, so
  // looping until `limit` is reached would only duplicate records (or never
  // terminate when nothing matches).
  const parentElements = Array.from(document.querySelectorAll(listSelector));

  for (const parent of parentElements) {
    if (scrapedData.length >= limit) break;

    if (parent.tagName.toLowerCase() === 'table') {
      // Prefer a real <thead> row; otherwise treat the first row as the header.
      const headerRow = parent.querySelector('thead tr') || parent.querySelector('tr');
      const headers = headerRow
        ? Array.from(headerRow.querySelectorAll('th, td')).map((cell) => cell.innerText.trim())
        : [];

      const bodyRows = parent.querySelectorAll('tbody tr');
      const rowsToProcess = Array.from(
        bodyRows.length > 0 ? bodyRows : parent.querySelectorAll('tr')
      );

      for (const row of rowsToProcess) {
        // Enforce the limit before pushing so a large table cannot overshoot it.
        if (scrapedData.length >= limit) break;

        // The header row may also be part of the rows being processed; never
        // emit it as data.
        if (row === headerRow) continue;

        const rowData = {};
        Array.from(row.querySelectorAll('td')).forEach((cell, cellIndex) => {
          const header = headers[cellIndex] || `Column${cellIndex + 1}`;
          rowData[header] = cell.innerText.trim();
        });

        scrapedData.push(rowData);
      }
    } else {
      const record = {};

      for (const [label, { selector, attribute }] of Object.entries(fields)) {
        const fieldElement = parent.querySelector(selector);
        if (!fieldElement) continue;

        // offsetParent is null for elements that are not rendered (e.g. display: none).
        // NOTE(review): it is also null for position: fixed elements — confirm that
        // treating those as hidden is acceptable.
        if (!includeHidden && fieldElement.offsetParent === null) continue;

        record[label] = readField(fieldElement, attribute);
      }

      scrapedData.push(record);
    }
  }

  return scrapedData;
};

/**
* Gets all children of the elements matching the listSelector,
Expand Down
30 changes: 23 additions & 7 deletions maxun-core/src/interpret.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,22 @@ import Concurrency from './utils/concurrency';
import Preprocessor from './preprocessor';
import log, { Level } from './utils/logger';

/**
 * Extends the global `Window` interface with the custom scraping functions so
 * that calls such as `window.scrapeList(...)` type-check.
 *
 * `scrapeList` is implemented as an `async` function, so it is typed as returning
 * a `Promise`. Its `pagination` and `includeHidden` options are optional.
 */
declare global {
  interface Window {
    scrape: (selector: string | null) => Record<string, string>[];
    scrapeSchema: (
      schema: Record<string, { selector: string; tag: string; attribute: string }>
    ) => Record<string, any>;
    scrapeList: (config: {
      listSelector: string;
      fields: any;
      limit?: number;
      includeHidden?: boolean;
      pagination?: any;
    }) => Promise<Record<string, any>[]>;
    scrapeListAuto: (listSelector: string) => { selector: string; innerText: string }[];
    scrollDown: (pages?: number) => void;
    scrollUp: (pages?: number) => void;
  }
}

/**
* Defines optional interpreter options (passed in constructor)
*/
Expand Down Expand Up @@ -214,11 +230,11 @@ export default class Interpreter extends EventEmitter {
// every condition is treated as a single context

switch (key as keyof typeof operators) {
case '$and':
case '$and' as keyof typeof operators:
return array?.every((x) => this.applicable(x, context));
case '$or':
case '$or' as keyof typeof operators:
return array?.some((x) => this.applicable(x, context));
case '$not':
case '$not' as keyof typeof operators:
return !this.applicable(<Where>value, context); // $not should be a unary operator
default:
throw new Error('Undefined logic operator.');
Expand All @@ -233,9 +249,9 @@ export default class Interpreter extends EventEmitter {
};

switch (key as keyof typeof meta) {
case '$before':
case '$before' as keyof typeof meta:
return !usedActions.find(testRegexString);
case '$after':
case '$after' as keyof typeof meta:
return !!usedActions.find(testRegexString);
default:
throw new Error('Undefined meta operator.');
Expand Down Expand Up @@ -357,15 +373,15 @@ export default class Interpreter extends EventEmitter {
};

for (const step of steps) {
this.log(`Launching ${step.action}`, Level.LOG);
this.log(`Launching ${String(step.action)}`, Level.LOG);

if (step.action in wawActions) {
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
await wawActions[step.action as CustomFunctions](...(params ?? []));
} else {
// Implements the dot notation for the "method name" in the workflow
const levels = step.action.split('.');
const levels = String(step.action).split('.');
const methodName = levels[levels.length - 1];

let invokee: any = page;
Expand Down