-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Feat: Add image extraction for webpages (media parsing) #818
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 1 commit
3a102be
df2efee
caaf131
d24496c
6624bba
3a68e60
9768844
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| const axios = require('axios'); | ||
| const cheerio = require('cheerio'); | ||
| const { URL } = require('url'); | ||
|
|
||
/**
 * Fetch a web page and collect the images referenced by its <img> tags.
 *
 * Each `src` is resolved to an absolute URL. Relative values are resolved
 * against the document's `<base href>` when one is present, otherwise against
 * the page URL itself. Images without a `src`, with an unparsable `src`, or
 * whose absolute URL was already collected are skipped.
 *
 * This is a best-effort helper: any failure while fetching or parsing is
 * logged and reported to the caller as an empty list, never as a rejection.
 *
 * NOTE(review): no export of this function is visible in this file — confirm
 * how callers are expected to import it.
 *
 * @param {string} url - Address of the page to scan.
 * @param {{ timeout?: number }} [options] - Optional settings.
 *   `timeout` is the request timeout in milliseconds (default 10000).
 * @returns {Promise<Array<{ url: string, altText: string }>>} Unique images in
 *   document order; `altText` is `''` when the tag has no `alt` attribute.
 */
async function extractImages(url, options) {
  try {
    const { timeout = 10000 } = options || {};

    // Bound the request so a stalled server cannot hang the caller forever.
    const { data: html } = await axios.get(url, { timeout });
    const $ = cheerio.load(html);

    // Relative URLs in a document are resolved against <base href> when the
    // page declares one, so honor it before falling back to the page URL.
    let baseUrl = url;
    const baseHref = $('base[href]').first().attr('href');
    if (baseHref) {
      try {
        baseUrl = new URL(baseHref, url).href;
      } catch {
        // Malformed <base href>: keep resolving against the page URL.
      }
    }

    const images = [];
    const seen = new Set(); // absolute URLs already emitted

    $('img').each((_index, element) => {
      const rawSrc = $(element).attr('src');
      if (!rawSrc) {
        return;
      }

      let absoluteSrc;
      try {
        absoluteSrc = new URL(rawSrc, baseUrl).href;
      } catch {
        // Unparsable src: skip this image but keep scanning the rest.
        return;
      }

      if (seen.has(absoluteSrc)) {
        return;
      }
      seen.add(absoluteSrc);
      images.push({ url: absoluteSrc, altText: $(element).attr('alt') || '' });
    });

    return images;
  } catch (error) {
    // Deliberately best-effort: report on stderr and hand back an empty list.
    console.error('Oops! Something went wrong while fetching images:', error.message);
    return [];
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,10 +24,11 @@ | |
| "@types/react-dom": "^18.0.1", | ||
| "@types/uuid": "^8.3.4", | ||
| "airtable": "^0.12.2", | ||
| "axios": "^1.9.0", | ||
| "axios": "^1.12.2", | ||
| "bcrypt": "^5.1.1", | ||
| "body-parser": "^1.20.3", | ||
| "buffer": "^6.0.3", | ||
| "cheerio": "^1.1.2", | ||
| "connect-pg-simple": "^10.0.0", | ||
| "cookie-parser": "^1.4.6", | ||
| "cors": "^2.8.5", | ||
|
|
@@ -80,6 +81,7 @@ | |
| "styled-components": "^5.3.3", | ||
| "swagger-jsdoc": "^6.2.8", | ||
| "swagger-ui-express": "^5.0.1", | ||
| "tesseract.js": "^6.0.1", | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Unused dependency: tesseract.js is not referenced in the current implementation. The tesseract.js library was added but is not imported or used in mediaParser.js. Consider removing it unless it's planned for future use (e.g., OCR on images). If tesseract.js is intended for future functionality, consider adding a comment in the code or creating a follow-up issue to track its implementation. 🤖 Prompt for AI Agents |
||
| "typedoc": "^0.23.8", | ||
| "typescript": "^4.6.3", | ||
| "uuid": "^8.3.2", | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.