From 9a5ab63d15b138681e940983c2b0a106078f75aa Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 13 Nov 2024 22:03:50 +0530 Subject: [PATCH 1/3] feat: extract hidden elements --- maxun-core/src/browserSide/scraper.js | 62 ++++++++++++++------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 79893568f..6e34a6a71 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -250,51 +250,53 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, )); } - /** + /** * Scrapes multiple lists of similar items based on a template item. * @param {Object} config - Configuration object * @param {string} config.listSelector - Selector for the list container(s) * @param {Object.} config.fields - Fields to scrape * @param {number} [config.limit] - Maximum number of items to scrape per list (optional) - * @param {boolean} [config.flexible=false] - Whether to use flexible matching for field selectors + * @param {boolean} [config.includeHidden=true] - Whether to include hidden elements in scraping * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ - window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - const scrapedData = []; +window.scrapeList = async function ({ listSelector, fields, limit = 10, includeHidden = true }) { + const scrapedData = []; - while (scrapedData.length < limit) { + while (scrapedData.length < limit) { // Get all parent elements matching the listSelector const parentElements = Array.from(document.querySelectorAll(listSelector)); // Iterate through each parent element for (const parent of parentElements) { - if (scrapedData.length >= limit) break; - const record = {}; - - // For each field, select the corresponding element within the parent - for (const [label, { selector, attribute }] of Object.entries(fields)) { - const fieldElement = parent.querySelector(selector); - - if (fieldElement) { - if (attribute === 'innerText') { - record[label] = fieldElement.innerText.trim(); - } else if (attribute === 'innerHTML') { - record[label] = fieldElement.innerHTML.trim(); - } else if (attribute === 'src') { - record[label] = fieldElement.src; - } else if (attribute === 'href') { - record[label] = fieldElement.href; - } else { - record[label] = fieldElement.getAttribute(attribute); - } + if (scrapedData.length >= limit) break; + const record = {}; + + // For each field, select the corresponding element within the parent + for (const [label, { selector, attribute }] of Object.entries(fields)) { + let fieldElement = parent.querySelector(selector); + + // If includeHidden is false, only include visible elements + if (fieldElement && (!includeHidden && fieldElement.offsetParent === null)) continue; + + if (fieldElement) { + if (attribute === 'innerText') { + record[label] = fieldElement.innerText.trim(); + } else if (attribute === 'innerHTML') { + record[label] = fieldElement.innerHTML.trim(); + } else if (attribute === 'src') { + record[label] = fieldElement.src; + } else if (attribute === 'href') { + record[label] = fieldElement.href; + } else { + record[label] = fieldElement.getAttribute(attribute); + } + } } - } - scrapedData.push(record); + scrapedData.push(record); } - } - return scrapedData - }; - + } + return scrapedData; +}; /** * Gets all children of the elements matching the listSelector, From 1c1d03834b1a38d2171c37f4bc5359933daece80 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 28 Nov 2024 19:58:28 +0530 Subject: [PATCH 2/3] feat: add scraping logic for tabular data within scrapeList function --- maxun-core/src/browserSide/scraper.js | 42 +++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 6e34a6a71..61ed6aa3e 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -266,12 +266,47 @@ window.scrapeList = async function ({ listSelector, fields, limit = 10, includeH // Get all parent elements matching the listSelector const parentElements = Array.from(document.querySelectorAll(listSelector)); - // Iterate through each parent element for (const parent of parentElements) { - if (scrapedData.length >= limit) break; + if (scrapedData.length >= limit) break; + + if (parent.tagName.toLowerCase() === 'table') { + let headers = []; + const headerRow = parent.querySelector('thead tr') || table.querySelector('tr'); + + if (headerRow) { + headers = Array.from(headerRow.querySelectorAll('th, td')).map(th => th.innerText.trim()); + } + + const tableRows = table.querySelectorAll('tbody tr'); + const rowsToProcess = tableRows.length > 0 ? tableRows : parent.querySelectorAll('tr'); + + rowsToProcess.forEach((row, rowIndex) => { + // Skip the header row if headers are derived from the first row outside + if ( + rowIndex === 0 && + headers.length > 0 && + headerRow.parentElement.tagName.toLowerCase() !== 'thead' + ) { + return; + } + + const cells = Array.from(row.querySelectorAll('td')); + const rowData = {}; + + cells.forEach((cell, cellIndex) => { + const header = headers[cellIndex] || `Column${cellIndex + 1}`; + rowData[header] = cell.innerText.trim(); + }); + + scrapedData.push(rowData); + + if (scrapedData.length >= limit) { + return; + } + }); + } else { const record = {}; - // For each field, select the corresponding element within the parent for (const [label, { selector, attribute }] of Object.entries(fields)) { let fieldElement = parent.querySelector(selector); @@ -293,6 +328,7 @@ window.scrapeList = async function ({ listSelector, fields, limit = 10, includeH } } scrapedData.push(record); + } } } return scrapedData; From db743fbdf652f6e5882ccc69fc3688e285f7e1af Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 28 Nov 2024 22:16:11 +0530 Subject: [PATCH 3/3] fix: rm all typescript errors --- maxun-core/src/interpret.ts | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index ecef02dbb..eb95be03b 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -16,6 +16,22 @@ import Concurrency from './utils/concurrency'; import Preprocessor from './preprocessor'; import log, { Level } from './utils/logger'; +/** + * Extending the Window interface for custom scraping functions. + */ +declare global { + interface Window { + scrape: (selector: string | null) => Record[]; + scrapeSchema: ( + schema: Record + ) => Record; + scrapeList: (config: { listSelector: string; fields: any; limit?: number; pagination: any }) => Record[]; + scrapeListAuto: (listSelector: string) => { selector: string; innerText: string }[]; + scrollDown: (pages?: number) => void; + scrollUp: (pages?: number) => void; + } +} + /** * Defines optional intepreter options (passed in constructor) */ @@ -214,11 +230,11 @@ export default class Interpreter extends EventEmitter { // every condition is treated as a single context switch (key as keyof typeof operators) { - case '$and': + case '$and' as keyof typeof operators: return array?.every((x) => this.applicable(x, context)); - case '$or': + case '$or' as keyof typeof operators: return array?.some((x) => this.applicable(x, context)); - case '$not': + case '$not' as keyof typeof operators: return !this.applicable(value, context); // $not should be a unary operator default: throw new Error('Undefined logic operator.'); @@ -233,9 +249,9 @@ export default class Interpreter extends EventEmitter { }; switch (key as keyof typeof meta) { - case '$before': + case '$before' as keyof typeof meta: return !usedActions.find(testRegexString); - case '$after': + case '$after' as keyof typeof meta: return !!usedActions.find(testRegexString); default: throw new Error('Undefined meta operator.'); @@ -357,7 +373,7 @@ export default class Interpreter extends EventEmitter { }; for (const step of steps) { - this.log(`Launching ${step.action}`, Level.LOG); + this.log(`Launching ${String(step.action)}`, Level.LOG); if (step.action in wawActions) { // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not) @@ -365,7 +381,7 @@ export default class Interpreter extends EventEmitter { await wawActions[step.action as CustomFunctions](...(params ?? [])); } else { // Implements the dot notation for the "method name" in the workflow - const levels = step.action.split('.'); + const levels = String(step.action).split('.'); const methodName = levels[levels.length - 1]; let invokee: any = page;