Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions mediaParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { URL } from 'url';
import logger from './logger'; // Adjust path if necessary

/**
 * Downloads the HTML document at `url` and collects every image referenced by
 * its `<img>` elements, from both the `src` and `srcset` attributes.
 *
 * Each discovered URL is resolved against `url`, so relative references come
 * back absolute. A given absolute URL is reported once, in the order it is
 * first encountered, paired with the `alt` text of the element that first
 * referenced it (empty string when the element has no `alt`).
 *
 * @param {string} url - Absolute URL of the page to scan.
 * @returns {Promise<Array<{ url: string, altText: string }>>} De-duplicated images.
 * @throws {TypeError} If `url` is not a non-empty string.
 * @throws {Error} If fetching or parsing fails; the underlying error is
 *   attached as `cause`.
 */
export async function extractImages(url) {
  // Validation stays outside the try block so callers get the TypeError
  // itself rather than the generic wrapper thrown below.
  if (!url || typeof url !== 'string') {
    throw new TypeError('URL must be a non-empty string');
  }

  try {
    const { data: html } = await axios.get(url, {
      timeout: 10000,
      maxContentLength: 10 * 1024 * 1024,
      maxBodyLength: 10 * 1024 * 1024,
      // Without an explicit responseType axios tries to JSON-parse the body,
      // which would hand cheerio an object instead of markup.
      responseType: 'text',
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; MaxunBot/1.0; +https://maxun.dev)',
      },
      maxRedirects: 5,
    });

    const $ = cheerio.load(html, {
      decodeEntities: true,
      normalizeWhitespace: false,
    });

    const images = [];
    const seen = new Set();

    // Resolves one candidate against the page URL and records it if new.
    // `label` only selects the wording of the warning ("image" or "srcset").
    const addImage = (rawUrl, altText, label) => {
      // An empty or whitespace-only reference resolves to the page URL itself,
      // which is not an image, so it is skipped rather than resolved.
      if (typeof rawUrl !== 'string' || rawUrl.trim() === '') {
        return;
      }

      let absoluteUrl;
      try {
        absoluteUrl = new URL(rawUrl.trim(), url).href;
      } catch {
        logger.warn(`Invalid ${label} URL: ${rawUrl}`);
        return;
      }

      if (!seen.has(absoluteUrl)) {
        seen.add(absoluteUrl);
        images.push({ url: absoluteUrl, altText });
      }
    };

    $('img').each((index, element) => {
      const $img = $(element);
      const alt = $img.attr('alt') || '';

      addImage($img.attr('src'), alt, 'image');

      const srcset = $img.attr('srcset');
      if (srcset) {
        // Each comma-separated candidate is "<url> [descriptor]"; only the URL
        // part is kept. NOTE(review): URLs that themselves contain commas
        // (e.g. data: URIs) are still split by this simple parser.
        for (const candidate of srcset.split(',')) {
          const [candidateUrl] = candidate.trim().split(/\s+/);
          addImage(candidateUrl, alt, 'srcset');
        }
      }
    });

    return images;
  } catch (error) {
    logger.error('Failed to extract images', { url, error: error.message });
    // Keep the original error reachable so its stack and type are not lost.
    throw new Error(`Failed to extract images from ${url}: ${error.message}`, { cause: error });
  }
}
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
"@types/react-dom": "^18.0.1",
"@types/uuid": "^8.3.4",
"airtable": "^0.12.2",
"axios": "^1.9.0",
"axios": "^1.12.2",
"bcrypt": "^5.1.1",
"body-parser": "^1.20.3",
"buffer": "^6.0.3",
"cheerio": "^1.1.2",
"connect-pg-simple": "^10.0.0",
"cookie-parser": "^1.4.6",
"cors": "^2.8.5",
Expand Down Expand Up @@ -80,6 +81,7 @@
"styled-components": "^5.3.3",
"swagger-jsdoc": "^6.2.8",
"swagger-ui-express": "^5.0.1",
"tesseract.js": "^6.0.1",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Unused dependency: tesseract.js is not referenced in the current implementation.

The tesseract.js library was added but is not imported or used in mediaParser.js. Consider removing it unless it's planned for future use (e.g., OCR on images).

If tesseract.js is intended for future functionality, consider adding a comment in the code or creating a follow-up issue to track its implementation.

🤖 Prompt for AI Agents
In package.json around line 84, the dependency "tesseract.js": "^6.0.1" is
unused in the codebase (not imported in mediaParser.js); remove the tesseract.js
entry from package.json and run npm/yarn install to update lockfile, or if it is
intended for future OCR work, leave the dependency but add a short TODO comment
in the relevant module or create a follow-up issue referencing this dependency
and its planned usage so it is tracked.

"typedoc": "^0.23.8",
"typescript": "^4.6.3",
"uuid": "^8.3.2",
Expand Down