Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions mediaParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// mediaParser.js

import axios from 'axios';
import * as cheerio from 'cheerio';
import { URL } from 'url';
import logger from './logger'; // Adjust path if necessary

/**
 * Splits a `srcset` attribute value into its candidate URLs.
 *
 * Follows the tokenisation used by the HTML "parse a srcset attribute"
 * algorithm: a URL is a run of non-whitespace characters, so commas that are
 * part of the URL (for example `data:image/png;base64,...` or CDN paths such
 * as `/w_100,h_50/a.png`) do not split it. Only trailing commas on the URL
 * token, or a comma after the descriptors, end a candidate.
 *
 * @param {string} srcset - Raw value of a `srcset` attribute.
 * @returns {string[]} Candidate URLs in document order, descriptors removed.
 */
function parseSrcsetUrls(srcset) {
  const urls = [];
  const { length } = srcset;
  let pos = 0;

  while (pos < length) {
    // Skip separators between candidates.
    while (pos < length && /[\s,]/.test(srcset[pos])) {
      pos += 1;
    }
    if (pos >= length) {
      break;
    }

    // The URL is everything up to the next whitespace character.
    const start = pos;
    while (pos < length && !/\s/.test(srcset[pos])) {
      pos += 1;
    }
    const token = srcset.slice(start, pos);

    if (token.endsWith(',')) {
      // "a.jpg, b.jpg": the comma terminated a candidate without descriptors.
      urls.push(token.replace(/,+$/, ''));
    } else {
      urls.push(token);
      // Skip the descriptors ("2x", "640w") up to the next candidate separator.
      while (pos < length && srcset[pos] !== ',') {
        pos += 1;
      }
    }
  }

  return urls;
}

/**
 * Determines the URL that relative image references must be resolved against.
 *
 * Preference order: the document's `<base href>` (itself resolved against the
 * page URL), then the final URL after redirects when the HTTP adapter exposes
 * it on the underlying request, then the URL that was originally requested.
 *
 * @param {Function} $ - Cheerio root function for the loaded document.
 * @param {object} response - The axios response for the page.
 * @param {string} requestedUrl - The URL passed to `extractImages`.
 * @returns {string} The base URL to use for resolution.
 */
function resolveBaseUrl($, response, requestedUrl) {
  const pageUrl =
    response.request?.res?.responseUrl || // Node adapter (follow-redirects)
    response.request?.responseURL || // XHR adapter
    requestedUrl;

  const baseHref = $('base[href]').first().attr('href');
  if (!baseHref) {
    return pageUrl;
  }

  try {
    return new URL(baseHref, pageUrl).href;
  } catch {
    logger.warn(`Invalid <base> href: ${baseHref}`);
    return pageUrl;
  }
}

/**
 * Resolves one raw image reference and records it if it is new.
 *
 * `data:` URLs are skipped, duplicates (by absolute URL) are ignored, and
 * references that cannot be parsed are logged rather than thrown so a single
 * bad attribute does not abort the whole extraction.
 *
 * @param {string} rawUrl - The URL as written in the markup.
 * @param {string} baseUrl - Base URL for resolving relative references.
 * @param {string} altText - Alt text to associate with the image.
 * @param {{images: Array, seen: Set<string>}} sink - Accumulator being filled.
 * @param {string} warnLabel - Prefix for the warning logged on invalid URLs.
 * @returns {void}
 */
function recordImage(rawUrl, baseUrl, altText, sink, warnLabel) {
  try {
    const absoluteUrl = new URL(rawUrl, baseUrl).href;
    if (!seenOrInline(absoluteUrl, sink.seen)) {
      sink.seen.add(absoluteUrl);
      sink.images.push({ url: absoluteUrl, altText });
    }
  } catch {
    logger.warn(`${warnLabel}: ${rawUrl}`);
  }
}

/**
 * Tells whether an absolute URL should be skipped: already collected, or an
 * inline `data:` URL.
 *
 * @param {string} absoluteUrl - Fully resolved URL.
 * @param {Set<string>} seen - URLs collected so far.
 * @returns {boolean} True when the URL must not be added.
 */
function seenOrInline(absoluteUrl, seen) {
  return seen.has(absoluteUrl) || absoluteUrl.startsWith('data:');
}

/**
 * Fetches a webpage and extracts every image it references, including
 * responsive variants.
 *
 * Sources inspected, in this order: `src` and `srcset` of each `<img>`, then
 * `srcset` of each `<source>` inside a `<picture>`. Relative references are
 * resolved against the document base (see `resolveBaseUrl`). Results are
 * de-duplicated by absolute URL and `data:` URLs are omitted.
 *
 * @param {string} url - The webpage URL to extract images from.
 *   Must be a non-empty string.
 * @returns {Promise<Array<{url: string, altText: string}>>} Resolves to one
 *   entry per unique image: `url` is the absolute image URL, `altText` is the
 *   `alt` of the owning `<img>` (empty string when absent or for `<source>`).
 * @throws {TypeError} If `url` is missing or not a string.
 * @throws {Error} If the fetch fails or the response is not HTML; the
 *   underlying error is available as `error.cause`.
 */
async function extractImages(url) {
  if (!url || typeof url !== 'string') {
    throw new TypeError('URL must be a non-empty string');
  }

  try {
    const response = await axios.get(url, {
      timeout: 10000,
      maxContentLength: 10 * 1024 * 1024,
      maxBodyLength: 10 * 1024 * 1024,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; MaxunBot/1.0; +https://maxun.dev)'
      },
      maxRedirects: 5
    });

    // Media types are case-insensitive, so compare in lower case.
    const contentType = response.headers['content-type'] || '';
    if (!String(contentType).toLowerCase().includes('text/html')) {
      throw new Error(`Expected HTML but got ${contentType}`);
    }

    const $ = cheerio.load(response.data, {
      decodeEntities: true,
      normalizeWhitespace: false
    });

    const baseUrl = resolveBaseUrl($, response, url);
    const sink = { images: [], seen: new Set() };

    // <img src> and <img srcset>
    $('img').each((_index, element) => {
      const img = $(element);
      const altText = img.attr('alt') || '';

      const src = img.attr('src');
      if (src) {
        recordImage(src, baseUrl, altText, sink, 'Invalid image URL');
      }

      const srcset = img.attr('srcset');
      if (srcset) {
        for (const candidate of parseSrcsetUrls(srcset)) {
          recordImage(candidate, baseUrl, altText, sink, 'Invalid srcset URL');
        }
      }
    });

    // <picture><source srcset>
    $('picture source').each((_index, element) => {
      const srcset = $(element).attr('srcset');
      if (srcset) {
        for (const candidate of parseSrcsetUrls(srcset)) {
          recordImage(candidate, baseUrl, '', sink, 'Invalid srcset URL in <source>');
        }
      }
    });

    return sink.images;
  } catch (error) {
    // Wrap with context while keeping the original error reachable via `cause`.
    throw new Error(`Failed to extract images from ${url}`, { cause: error });
  }
}

// Export function for other modules
export { extractImages };
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
"@types/react-dom": "^18.0.1",
"@types/uuid": "^8.3.4",
"airtable": "^0.12.2",
"axios": "^1.9.0",
"axios": "^1.12.2",
"bcrypt": "^5.1.1",
"body-parser": "^1.20.3",
"buffer": "^6.0.3",
"cheerio": "^1.1.2",
"connect-pg-simple": "^10.0.0",
"cookie-parser": "^1.4.6",
"cors": "^2.8.5",
Expand Down Expand Up @@ -80,6 +81,7 @@
"styled-components": "^5.3.3",
"swagger-jsdoc": "^6.2.8",
"swagger-ui-express": "^5.0.1",
"tesseract.js": "^6.0.1",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Unused dependency: tesseract.js is not referenced in the current implementation.

The tesseract.js library was added but is not imported or used in mediaParser.js. Consider removing it unless it's planned for future use (e.g., OCR on images).

If tesseract.js is intended for future functionality, consider adding a comment in the code or creating a follow-up issue to track its implementation.

🤖 Prompt for AI Agents
In package.json around line 84, the dependency "tesseract.js": "^6.0.1" is
unused in the codebase (not imported in mediaParser.js); remove the tesseract.js
entry from package.json and run npm/yarn install to update lockfile, or if it is
intended for future OCR work, leave the dependency but add a short TODO comment
in the relevant module or create a follow-up issue referencing this dependency
and its planned usage so it is tracked.

"typedoc": "^0.23.8",
"typescript": "^4.6.3",
"uuid": "^8.3.2",
Expand Down