From 53d77543ecd2bacc41cf9c307c96f288dc523bad Mon Sep 17 00:00:00 2001 From: "J.Chen" Date: Mon, 11 May 2026 02:38:06 +0800 Subject: [PATCH 1/3] fix(facebook/feed): add fallback extraction for empty article nodes Add fallback extraction for Facebook feed posts when [role=article] nodes exist but contain empty text. Includes diagnostic errors, content/author cleanup, nested-container dedupe, and an evaluate-script syntax regression test. --- clis/facebook/feed.js | 159 +++++++++++++++++++++++++++++-------- clis/facebook/feed.test.js | 25 ++++++ 2 files changed, 150 insertions(+), 34 deletions(-) create mode 100644 clis/facebook/feed.test.js diff --git a/clis/facebook/feed.js b/clis/facebook/feed.js index a2bd538e0..ccf4ece5f 100644 --- a/clis/facebook/feed.js +++ b/clis/facebook/feed.js @@ -13,47 +13,138 @@ cli({ { navigate: { url: 'https://www.facebook.com/', settleMs: 4000 } }, { evaluate: `(() => { const limit = \${{ args.limit }}; - const posts = document.querySelectorAll('[role="article"]'); - return Array.from(posts) + + // ── Primary extraction via [role="article"] ────────────────────────── + const articleNodes = document.querySelectorAll('[role="article"]'); + const primaryPosts = Array.from(articleNodes) .filter(el => { const text = el.textContent.trim(); - // Filter out "People you may know" suggestions (both CN and EN) return text.length > 30 && !text.startsWith('可能认识') && !text.startsWith('People you may know') && !text.startsWith('People You May Know'); - }) - .slice(0, limit) - .map((el, i) => { - // Author from header link - const headerLink = el.querySelector('h2 a, h3 a, h4 a, strong a'); - const author = headerLink ? headerLink.textContent.trim() : ''; - - // Post text: grab visible spans, filter noise - const spans = Array.from(el.querySelectorAll('div[dir="auto"]')) - .map(s => s.textContent.trim()) - .filter(t => t.length > 10 && t.length < 500); - const content = spans.length > 0 ? spans[0] : ''; - - // Engagement: find like/comment/share counts (CN + EN) - const allText = el.textContent; - const likesMatch = allText.match(/所有心情:([\\d,.\\s]*[\\d万亿KMk]+)/) || - allText.match(/All:\\s*([\\d,.KMk]+)/) || - allText.match(/([\\d,.KMk]+)\\s*(?:likes?|reactions?)/i); - const commentsMatch = allText.match(/([\\d,.]+\\s*[万亿]?)\\s*条评论/) || - allText.match(/([\\d,.KMk]+)\\s*comments?/i); - const sharesMatch = allText.match(/([\\d,.]+\\s*[万亿]?)\\s*次分享/) || - allText.match(/([\\d,.KMk]+)\\s*shares?/i); - - return { - index: i + 1, - author: author.substring(0, 50), - content: content.replace(/\\n/g, ' ').substring(0, 120), - likes: likesMatch ? likesMatch[1] : '-', - comments: commentsMatch ? commentsMatch[1] : '-', - shares: sharesMatch ? sharesMatch[1] : '-', - }; }); + + // ── Fallback extraction via action buttons ──────────────────────────── + // Facebook periodically restructures its DOM so [role="article"] nodes + // exist but have empty textContent. When that happens we locate post + // boundaries via the Like/Comment action buttons, then walk up the DOM + // to the nearest ancestor that contains meaningful text. + function fallbackExtract() { + const main = document.querySelector('[role="main"]'); + if (!main) return null; + + const likeSelectors = [ + '[aria-label="Like"]', '[aria-label="赞"]', + '[aria-label="Comment"]', '[aria-label="评论"]', + ]; + const actionButtons = Array.from( + main.querySelectorAll(likeSelectors.join(',')) + ); + + const seen = new WeakSet(); + const containers = []; + for (const btn of actionButtons) { + let node = btn.parentElement; + let found = null; + for (let depth = 0; depth < 20 && node; depth++, node = node.parentElement) { + if (node.textContent.trim().length >= 80) { found = node; break; } + } + if (!found || seen.has(found)) continue; + seen.add(found); + containers.push(found); + } + return containers.length ? containers : null; + } + + // ── Extract fields from a post container ───────────────────────────── + function extractPost(el, i) { + // Try progressively broader selectors: heading links → role=link → any profile link → first substantial link + const authorLink = + el.querySelector('h2 a, h3 a, h4 a, strong a') || + el.querySelector('a[href*="/"][role="link"]') || + el.querySelector('a[href*="facebook.com/"]') || + Array.from(el.querySelectorAll('a[href]')).find(a => { + const t = a.textContent.trim(); + return t.length > 2 && t.length < 60 && !/^(like|comment|share|follow|\\d)/i.test(t); + }); + // Fallback for sponsored posts where the advertiser name is not in a link + const author = (authorLink ? authorLink.textContent.trim() : '') || + (() => { + const short = Array.from(el.querySelectorAll('[dir="auto"]')) + .map(s => s.textContent.trim()) + .find(t => t.length > 2 && t.length <= 60 && !t.startsWith('#')); + return short || ''; + })(); + + const seen = new Set(); + const dirAutos = Array.from(el.querySelectorAll('[dir="auto"]')) + .map(s => s.textContent.trim()) + .filter(t => t.length > 10 && t.length < 600 && !seen.has(t) && seen.add(t)); + const content = dirAutos.join(' '); + + const allText = el.textContent; + const likesMatch = allText.match(/所有心情:([\\d,.\\s]*[\\d万亿KMk]+)/) || + allText.match(/All:\\s*([\\d,.KMk]+)/) || + allText.match(/([\\d,.KMk]+)\\s*(?:likes?|reactions?)/i); + const commentsMatch = allText.match(/([\\d,.]+\\s*[万亿]?)\\s*条评论/) || + allText.match(/([\\d,.KMk]+)\\s*comments?/i); + const sharesMatch = allText.match(/([\\d,.]+\\s*[万亿]?)\\s*次分享/) || + allText.match(/([\\d,.KMk]+)\\s*shares?/i); + + return { + index: i + 1, + author: author.substring(0, 50), + content: content.replace(/\\n/g, ' ').substring(0, 120), + likes: likesMatch ? likesMatch[1] : '-', + comments: commentsMatch ? commentsMatch[1] : '-', + shares: sharesMatch ? sharesMatch[1] : '-', + }; + } + + // ── Route: primary alone if sufficient, else supplement with fallback ── + const isNotSuggestion = el => { + const t = el.textContent.trim(); + return !t.startsWith('可能认识') && !t.startsWith('People you may know') && !t.startsWith('People You May Know'); + }; + + if (primaryPosts.length >= limit) { + return primaryPosts.slice(0, limit).map((el, i) => extractPost(el, i)); + } + + const fallbackContainers = fallbackExtract(); + const fallbackPosts = fallbackContainers ? fallbackContainers.filter(isNotSuggestion) : []; + + if (primaryPosts.length > 0 || fallbackPosts.length > 0) { + const primarySet = new WeakSet(primaryPosts); + const extra = fallbackPosts.filter(el => !primarySet.has(el)); + const combined = [...primaryPosts, ...extra]; + // Deduplicate nested containers of the same post: same-post ancestors + // share all [dir="auto"] blocks, so joining them gives a stable signature. + // Different posts by the same author differ in body text even if they + // share an author-name prefix, so they won't collide here. + const seenContent = new Set(); + const deduped = combined.filter(el => { + const key = Array.from(el.querySelectorAll('[dir="auto"]')) + .map(s => s.textContent.trim()).filter(t => t.length > 5) + .join('|').substring(0, 200); + if (!key || seenContent.has(key)) return false; + seenContent.add(key); + return true; + }); + return deduped.slice(0, limit).map((el, i) => extractPost(el, i)); + } + + // ── Diagnostic when both paths return nothing ───────────────────────── + const mainEl = document.querySelector('[role="main"]'); + const articleCount = articleNodes.length; + const mainLen = mainEl ? mainEl.textContent.trim().length : 0; + throw new Error( + 'facebook feed: no posts found. ' + + 'article nodes=' + articleCount + ' (all empty text), ' + + 'main textLength=' + mainLen + '. ' + + 'The page may not be fully loaded or Facebook DOM changed again.' + ); })() ` }, ], diff --git a/clis/facebook/feed.test.js b/clis/facebook/feed.test.js new file mode 100644 index 000000000..1ae1f475f --- /dev/null +++ b/clis/facebook/feed.test.js @@ -0,0 +1,25 @@ +/** + * Regression test: evaluate scripts inside template literals must produce + * syntactically valid JavaScript after framework placeholder substitution. + * Catches double-escaping bugs (\d, \s, \n) that typecheck cannot see + * because the code lives inside a string passed to page.evaluate. + */ +import { describe, expect, it } from 'vitest'; +import { getRegistry } from '@jackwener/opencli/registry'; +import './feed.js'; + +describe('facebook feed evaluate script', () => { + it('produces valid JS after placeholder substitution', () => { + const cmd = getRegistry().get('facebook/feed'); + expect(cmd).toBeDefined(); + + const evaluateStep = cmd.pipeline?.find(step => 'evaluate' in step); + expect(evaluateStep).toBeDefined(); + + // Replace framework placeholders ${{ expr }} with dummy values so + // new Function() can parse the script without substitution support. + const script = evaluateStep.evaluate.replace(/\$\{\{[^}]*\}\}/g, '10'); + + expect(() => new Function(`return (${script})`)).not.toThrow(); + }); +}); From eba7698f0b24929c50f0c26768f8575f8e18c655 Mon Sep 17 00:00:00 2001 From: jackwener Date: Thu, 14 May 2026 18:30:03 +0800 Subject: [PATCH 2/3] fix(facebook): bound feed fallback extraction --- cli-manifest.json | 2 +- clis/facebook/feed.js | 426 +++++++++++++++++++++++++------------ clis/facebook/feed.test.js | 144 +++++++++++-- 3 files changed, 419 insertions(+), 153 deletions(-) diff --git a/cli-manifest.json b/cli-manifest.json index 17b77d182..20d77a567 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -9387,7 +9387,7 @@ "type": "js", "modulePath": "facebook/feed.js", "sourceFile": "facebook/feed.js", - "navigateBefore": "https://www.facebook.com" + "navigateBefore": false }, { "site": "facebook", diff --git a/clis/facebook/feed.js b/clis/facebook/feed.js index ccf4ece5f..b1db20c34 100644 --- a/clis/facebook/feed.js +++ b/clis/facebook/feed.js @@ -1,151 +1,305 @@ -import { cli } from '@jackwener/opencli/registry'; -cli({ - site: 'facebook', - name: 'feed', - access: 'read', - description: 'Get your Facebook news feed', - domain: 'www.facebook.com', - args: [ - { name: 'limit', type: 'int', default: 10, help: 'Number of posts' }, - ], - columns: ['index', 'author', 'content', 'likes', 'comments', 'shares'], - pipeline: [ - { navigate: { url: 'https://www.facebook.com/', settleMs: 4000 } }, - { evaluate: `(() => { - const limit = \${{ args.limit }}; - - // ── Primary extraction via [role="article"] ────────────────────────── - const articleNodes = document.querySelectorAll('[role="article"]'); - const primaryPosts = Array.from(articleNodes) - .filter(el => { - const text = el.textContent.trim(); - return text.length > 30 && - !text.startsWith('可能认识') && - !text.startsWith('People you may know') && - !text.startsWith('People You May Know'); - }); - - // ── Fallback extraction via action buttons ──────────────────────────── - // Facebook periodically restructures its DOM so [role="article"] nodes - // exist but have empty textContent. When that happens we locate post - // boundaries via the Like/Comment action buttons, then walk up the DOM - // to the nearest ancestor that contains meaningful text. - function fallbackExtract() { - const main = document.querySelector('[role="main"]'); - if (!main) return null; - - const likeSelectors = [ - '[aria-label="Like"]', '[aria-label="赞"]', - '[aria-label="Comment"]', '[aria-label="评论"]', - ]; - const actionButtons = Array.from( - main.querySelectorAll(likeSelectors.join(',')) - ); +import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; +import { cli, Strategy } from '@jackwener/opencli/registry'; + +const FACEBOOK_HOME = 'https://www.facebook.com/'; +const MAX_LIMIT = 50; + +function requireLimit(value) { + const n = Number(value); + if (!Number.isInteger(n) || n < 1 || n > MAX_LIMIT) { + throw new ArgumentError(`facebook feed --limit must be an integer between 1 and ${MAX_LIMIT}`); + } + return n; +} + +function unwrapBrowserResult(value) { + if (value && typeof value === 'object' && 'data' in value) { + return value.data; + } + return value; +} + +function buildFeedExtractScript(limit) { + return `(() => { + const limit = ${limit}; + + function clean(value) { + return String(value || '').replace(/\\s+/g, ' ').trim(); + } + + function textOf(el) { + return clean(el && el.textContent); + } + + function labelOf(el) { + return clean(el && el.getAttribute && el.getAttribute('aria-label')); + } + + function isAuthPage() { + const path = window.location && window.location.pathname ? window.location.pathname : ''; + const body = textOf(document.body); + return /^\\/(login|checkpoint)(\\/|$|\\.php)/.test(path) + || /^(Log in to Facebook|Facebook登录|登录 Facebook)/i.test(body) + || /You must log in to continue/i.test(body); + } + + function isExplicitEmptyFeed() { + const body = textOf(document.body); + return /No posts available|Nothing to show|暂无动态|没有更多动态|还没有帖子/i.test(body); + } - const seen = new WeakSet(); - const containers = []; - for (const btn of actionButtons) { - let node = btn.parentElement; - let found = null; - for (let depth = 0; depth < 20 && node; depth++, node = node.parentElement) { - if (node.textContent.trim().length >= 80) { found = node; break; } + function isSuggestionOrChrome(text) { + return /^(People you may know|People You May Know|可能认识的人?|你可能认识的人?)/i.test(text) + || /^(Suggested for you|Suggested Groups|推荐小组|推荐内容)/i.test(text); + } + + function isSponsored(text) { + return /(^|\\s)(Sponsored|赞助|广告)(\\s|$)/i.test(text); + } + + function isActionText(text) { + return /^(Like|Comment|Share|Send|Follow|赞|评论|分享|发送|关注)$/i.test(text); + } + + function isMetricText(text) { + return /^(All:|所有心情:)/i.test(text) + || /\\b(likes?|reactions?|comments?|shares?)\\b/i.test(text) + || /(条评论|次分享)$/.test(text); + } + + function isTimestampText(text) { + return /^(\\d+\\s*(s|m|h|d|w|mo|yr|min|sec|second|minute|hour|day|week|month|year)s?|Just now|Yesterday|刚刚|昨天|\\d+小时|\\d+天)(\\s*[·•.])?$/i.test(text); + } + + function postUrlFrom(root) { + const links = Array.from(root.querySelectorAll('a[href]')); + for (const link of links) { + const href = link.href || link.getAttribute('href') || ''; + if (/\\/posts\\/|\\/permalink\\.php|\\/story\\.php|\\/photo\\/\\?fbid=|\\/groups\\/[^/]+\\/posts\\//i.test(href)) { + return href; + } } - if (!found || seen.has(found)) continue; - seen.add(found); - containers.push(found); + return ''; + } + + function actionKinds(root) { + const kinds = new Set(); + for (const el of root.querySelectorAll('[aria-label]')) { + const label = labelOf(el); + if (/^(Like|赞)$/i.test(label)) kinds.add('like'); + if (/^(Comment|评论)$/i.test(label)) kinds.add('comment'); + if (/^(Share|分享)$/i.test(label)) kinds.add('share'); + } + return kinds; + } + + function visibleBlocks(root) { + const seen = new Set(); + return Array.from(root.querySelectorAll('[dir="auto"]')) + .map(textOf) + .filter((text) => { + if (!text || text.length > 600 || seen.has(text)) return false; + seen.add(text); + return true; + }); + } + + function findAuthor(root) { + const links = [ + root.querySelector('h2 a[href], h3 a[href], h4 a[href], strong a[href]'), + ...Array.from(root.querySelectorAll('a[role="link"][href]')), + ].filter(Boolean); + for (const link of links) { + const text = textOf(link); + const href = link.href || link.getAttribute('href') || ''; + if (text.length > 1 && text.length <= 80 + && !isActionText(text) + && !isMetricText(text) + && !isTimestampText(text) + && !/\\/groups\\/|\\/watch\\/|\\/reel\\/|\\/events\\/|\\/friends\\//i.test(href)) { + return text; + } + } + return ''; } - return containers.length ? containers : null; - } - // ── Extract fields from a post container ───────────────────────────── - function extractPost(el, i) { - // Try progressively broader selectors: heading links → role=link → any profile link → first substantial link - const authorLink = - el.querySelector('h2 a, h3 a, h4 a, strong a') || - el.querySelector('a[href*="/"][role="link"]') || - el.querySelector('a[href*="facebook.com/"]') || - Array.from(el.querySelectorAll('a[href]')).find(a => { - const t = a.textContent.trim(); - return t.length > 2 && t.length < 60 && !/^(like|comment|share|follow|\\d)/i.test(t); + function contentBlocks(root, author) { + return visibleBlocks(root).filter((text) => { + if (text === author) return false; + if (text.length <= 10) return false; + if (isSuggestionOrChrome(text) || isSponsored(text)) return false; + if (isActionText(text) || isMetricText(text) || isTimestampText(text)) return false; + if (/^(See more|查看更多|更多)$/i.test(text)) return false; + return true; }); - // Fallback for sponsored posts where the advertiser name is not in a link - const author = (authorLink ? authorLink.textContent.trim() : '') || - (() => { - const short = Array.from(el.querySelectorAll('[dir="auto"]')) - .map(s => s.textContent.trim()) - .find(t => t.length > 2 && t.length <= 60 && !t.startsWith('#')); - return short || ''; - })(); - - const seen = new Set(); - const dirAutos = Array.from(el.querySelectorAll('[dir="auto"]')) - .map(s => s.textContent.trim()) - .filter(t => t.length > 10 && t.length < 600 && !seen.has(t) && seen.add(t)); - const content = dirAutos.join(' '); - - const allText = el.textContent; - const likesMatch = allText.match(/所有心情:([\\d,.\\s]*[\\d万亿KMk]+)/) || - allText.match(/All:\\s*([\\d,.KMk]+)/) || - allText.match(/([\\d,.KMk]+)\\s*(?:likes?|reactions?)/i); - const commentsMatch = allText.match(/([\\d,.]+\\s*[万亿]?)\\s*条评论/) || - allText.match(/([\\d,.KMk]+)\\s*comments?/i); - const sharesMatch = allText.match(/([\\d,.]+\\s*[万亿]?)\\s*次分享/) || - allText.match(/([\\d,.KMk]+)\\s*shares?/i); + } + + function extractPost(root, index) { + const fullText = textOf(root); + if (!fullText || isSuggestionOrChrome(fullText) || isSponsored(fullText)) return null; + + const author = findAuthor(root); + const blocks = contentBlocks(root, author); + const content = clean(blocks.join(' ')); + const postUrl = postUrlFrom(root); + const kinds = actionKinds(root); + + if (!author && !content) return null; + if (!content && !postUrl && kinds.size < 2) return null; + + const likesMatch = fullText.match(/所有心情:([\\d,.\\s]*[\\d万亿KMk]+)/) + || fullText.match(/All:\\s*([\\d,.KMk]+)/) + || fullText.match(/([\\d,.KMk]+)\\s*(?:likes?|reactions?)/i); + const commentsMatch = fullText.match(/([\\d,.]+\\s*[万亿]?)\\s*条评论/) + || fullText.match(/([\\d,.KMk]+)\\s*comments?/i); + const sharesMatch = fullText.match(/([\\d,.]+\\s*[万亿]?)\\s*次分享/) + || fullText.match(/([\\d,.KMk]+)\\s*shares?/i); + + return { + index, + author: author.substring(0, 50), + content: content.substring(0, 120), + likes: likesMatch ? clean(likesMatch[1]) : '-', + comments: commentsMatch ? clean(commentsMatch[1]) : '-', + shares: sharesMatch ? clean(sharesMatch[1]) : '-', + }; + } + + function primaryContainers() { + return Array.from(document.querySelectorAll('[role="article"]')) + .filter((el) => textOf(el).length > 30); + } + + function fallbackContainers() { + const main = document.querySelector('[role="main"]'); + if (!main) return []; + const buttons = Array.from(main.querySelectorAll('[aria-label="Like"], [aria-label="赞"], [aria-label="Comment"], [aria-label="评论"], [aria-label="Share"], [aria-label="分享"]')); + const seen = new WeakSet(); + const containers = []; + for (const button of buttons) { + let node = button.parentElement; + for (let depth = 0; depth < 16 && node && node !== main && node !== document.body; depth += 1, node = node.parentElement) { + const text = textOf(node); + const kinds = actionKinds(node); + const blocks = visibleBlocks(node); + const hasPostEvidence = Boolean(postUrlFrom(node)) || blocks.some((block) => block.length > 20 && !isActionText(block) && !isMetricText(block)); + if (text.length >= 80 && kinds.has('like') && (kinds.has('comment') || kinds.has('share')) && hasPostEvidence) { + if (!seen.has(node)) { + seen.add(node); + containers.push(node); + } + break; + } + } + } + return containers; + } + + function dedupe(containers) { + const seen = new Set(); + const result = []; + for (const node of containers) { + const key = postUrlFrom(node) || contentBlocks(node, findAuthor(node)).join('|').substring(0, 200); + if (!key || seen.has(key)) continue; + seen.add(key); + result.push(node); + } + return result; + } + + if (isAuthPage()) return { status: 'auth', rows: [], diagnostics: {} }; + + const primary = primaryContainers(); + const combined = primary.length >= limit ? primary : dedupe([...primary, ...fallbackContainers()]); + const rows = []; + for (const container of combined) { + const row = extractPost(container, rows.length + 1); + if (row) rows.push(row); + if (rows.length >= limit) break; + } return { - index: i + 1, - author: author.substring(0, 50), - content: content.replace(/\\n/g, ' ').substring(0, 120), - likes: likesMatch ? likesMatch[1] : '-', - comments: commentsMatch ? commentsMatch[1] : '-', - shares: sharesMatch ? sharesMatch[1] : '-', + status: rows.length ? 'ok' : (isExplicitEmptyFeed() ? 'empty' : 'no_rows'), + rows, + diagnostics: { + articleCount: document.querySelectorAll('[role="article"]').length, + primaryCount: primary.length, + fallbackActionCount: document.querySelectorAll('[role="main"] [aria-label="Like"], [role="main"] [aria-label="赞"], [role="main"] [aria-label="Comment"], [role="main"] [aria-label="评论"]').length, + mainTextLength: textOf(document.querySelector('[role="main"]')).length, + }, }; + })()`; +} + +async function getFacebookFeed(page, kwargs) { + const limit = requireLimit(kwargs.limit ?? 10); + try { + await page.goto(FACEBOOK_HOME, { settleMs: 4000 }); + } catch (err) { + throw new CommandExecutionError( + `Failed to navigate to facebook feed: ${err instanceof Error ? err.message : err}`, + 'Check that facebook.com is reachable and the browser extension is connected.', + ); } - // ── Route: primary alone if sufficient, else supplement with fallback ── - const isNotSuggestion = el => { - const t = el.textContent.trim(); - return !t.startsWith('可能认识') && !t.startsWith('People you may know') && !t.startsWith('People You May Know'); - }; + let payload; + try { + payload = unwrapBrowserResult(await page.evaluate(buildFeedExtractScript(limit))); + } catch (err) { + throw new CommandExecutionError( + `Failed to read facebook feed: ${err instanceof Error ? err.message : err}`, + 'Facebook may not have rendered or the feed markup may have changed.', + ); + } + + if (!payload || typeof payload !== 'object' || !Array.isArray(payload.rows)) { + throw new CommandExecutionError('facebook feed returned malformed extraction payload'); + } - if (primaryPosts.length >= limit) { - return primaryPosts.slice(0, limit).map((el, i) => extractPost(el, i)); + if (payload.status === 'auth') { + throw new AuthRequiredError('www.facebook.com', 'Open Chrome and log in to Facebook before retrying.'); } - const fallbackContainers = fallbackExtract(); - const fallbackPosts = fallbackContainers ? fallbackContainers.filter(isNotSuggestion) : []; - - if (primaryPosts.length > 0 || fallbackPosts.length > 0) { - const primarySet = new WeakSet(primaryPosts); - const extra = fallbackPosts.filter(el => !primarySet.has(el)); - const combined = [...primaryPosts, ...extra]; - // Deduplicate nested containers of the same post: same-post ancestors - // share all [dir="auto"] blocks, so joining them gives a stable signature. - // Different posts by the same author differ in body text even if they - // share an author-name prefix, so they won't collide here. - const seenContent = new Set(); - const deduped = combined.filter(el => { - const key = Array.from(el.querySelectorAll('[dir="auto"]')) - .map(s => s.textContent.trim()).filter(t => t.length > 5) - .join('|').substring(0, 200); - if (!key || seenContent.has(key)) return false; - seenContent.add(key); - return true; - }); - return deduped.slice(0, limit).map((el, i) => extractPost(el, i)); + if (payload.rows.length > 0) { + return payload.rows; } - // ── Diagnostic when both paths return nothing ───────────────────────── - const mainEl = document.querySelector('[role="main"]'); - const articleCount = articleNodes.length; - const mainLen = mainEl ? mainEl.textContent.trim().length : 0; - throw new Error( - 'facebook feed: no posts found. ' + - 'article nodes=' + articleCount + ' (all empty text), ' + - 'main textLength=' + mainLen + '. ' + - 'The page may not be fully loaded or Facebook DOM changed again.' - ); -})() -` }, - ], -}); + if (payload.status === 'empty') { + throw new EmptyResultError('facebook feed', 'Facebook did not show any feed posts for this account.'); + } + + const diagnostics = payload.diagnostics || {}; + if (diagnostics.articleCount || diagnostics.fallbackActionCount || diagnostics.mainTextLength > 200) { + throw new CommandExecutionError( + 'facebook feed page rendered but no feed rows could be extracted', + `Diagnostics: articles=${diagnostics.articleCount || 0}, actions=${diagnostics.fallbackActionCount || 0}, mainTextLength=${diagnostics.mainTextLength || 0}.`, + ); + } + + throw new EmptyResultError('facebook feed', 'No Facebook feed content was visible in the current browser session.'); +} + +const command = { + site: 'facebook', + name: 'feed', + access: 'read', + description: 'Get your Facebook news feed', + domain: 'www.facebook.com', + strategy: Strategy.COOKIE, + browser: true, + navigateBefore: false, + args: [ + { name: 'limit', type: 'int', default: 10, help: 'Number of posts' }, + ], + columns: ['index', 'author', 'content', 'likes', 'comments', 'shares'], + func: getFacebookFeed, +}; + +cli(command); + +export const __test__ = { + buildFeedExtractScript, + command, + getFacebookFeed, + requireLimit, +}; diff --git a/clis/facebook/feed.test.js b/clis/facebook/feed.test.js index 1ae1f475f..b836480de 100644 --- a/clis/facebook/feed.test.js +++ b/clis/facebook/feed.test.js @@ -1,25 +1,137 @@ -/** - * Regression test: evaluate scripts inside template literals must produce - * syntactically valid JavaScript after framework placeholder substitution. - * Catches double-escaping bugs (\d, \s, \n) that typecheck cannot see - * because the code lives inside a string passed to page.evaluate. - */ -import { describe, expect, it } from 'vitest'; +import { describe, expect, it, vi } from 'vitest'; +import { JSDOM } from 'jsdom'; +import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; import { getRegistry } from '@jackwener/opencli/registry'; -import './feed.js'; +import { __test__ } from './feed.js'; -describe('facebook feed evaluate script', () => { - it('produces valid JS after placeholder substitution', () => { +function runExtract(html, limit = 10, url = 'https://www.facebook.com/') { + const dom = new JSDOM(html, { url }); + return Function('window', 'document', `return ${__test__.buildFeedExtractScript(limit)};`)(dom.window, dom.window.document); +} + +function createPage(payload) { + return { + goto: vi.fn().mockResolvedValue(undefined), + evaluate: vi.fn().mockResolvedValue(payload), + }; +} + +describe('facebook feed', () => { + it('registers the feed command with the existing row contract', () => { const cmd = getRegistry().get('facebook/feed'); expect(cmd).toBeDefined(); + expect(cmd.columns).toEqual(['index', 'author', 'content', 'likes', 'comments', 'shares']); + }); + + it('extracts existing role=article feed rows', () => { + const payload = runExtract(` +
+
+

Alice Example

+
This is a normal Facebook feed post with enough text to extract.
+ All: 12 + 3 comments + 2 shares +
+
+
+ `); + + expect(payload.status).toBe('ok'); + expect(payload.rows).toEqual([{ + index: 1, + author: 'Alice Example', + content: 'This is a normal Facebook feed post with enough text to extract.', + likes: '12', + comments: '3', + shares: '2', + }]); + }); + + it('falls back from empty article nodes to action-bounded feed containers', () => { + const payload = runExtract(` +
+
+
+
+

Bob Builder

+
Fallback post body from a Facebook feed card with empty article text.
+ Permalink + All: 1.2K + 4 comments + 1 shares +
+
+
+
+ `); + + expect(payload.status).toBe('ok'); + expect(payload.rows).toEqual([{ + index: 1, + author: 'Bob Builder', + content: 'Fallback post body from a Facebook feed card with empty article text.', + likes: '1.2K', + comments: '4', + shares: '1', + }]); + }); - const evaluateStep = cmd.pipeline?.find(step => 'evaluate' in step); - expect(evaluateStep).toBeDefined(); + it('does not turn suggestions or side chrome action buttons into feed rows', () => { + const payload = runExtract(` +
+ + +
+ `); - // Replace framework placeholders ${{ expr }} with dummy values so - // new Function() can parse the script without substitution support. - const script = evaluateStep.evaluate.replace(/\$\{\{[^}]*\}\}/g, '10'); + expect(payload.status).toBe('no_rows'); + expect(payload.rows).toEqual([]); + }); + + it('reports auth pages from the browser extractor', () => { + const payload = runExtract('
Log in to Facebook
', 10, 'https://www.facebook.com/login/'); + expect(payload.status).toBe('auth'); + expect(payload.rows).toEqual([]); + }); + + it('validates limit before browser navigation', async () => { + const page = createPage({ status: 'ok', rows: [] }); + await expect(__test__.command.func(page, { limit: 0 })).rejects.toBeInstanceOf(ArgumentError); + expect(page.goto).not.toHaveBeenCalled(); + }); + + it('maps browser envelopes and returns extracted rows', async () => { + const page = createPage({ session: 'site:facebook', data: { status: 'ok', rows: [{ index: 1, author: 'A', content: 'Body', likes: '-', comments: '-', shares: '-' }] } }); + + await expect(__test__.command.func(page, { limit: 1 })).resolves.toEqual([{ + index: 1, + author: 'A', + content: 'Body', + likes: '-', + comments: '-', + shares: '-', + }]); + }); - expect(() => new Function(`return (${script})`)).not.toThrow(); + it('maps auth, real empty, parser drift, and malformed payloads to typed errors', async () => { + await expect(__test__.command.func(createPage({ status: 'auth', rows: [] }), { limit: 1 })) + .rejects.toBeInstanceOf(AuthRequiredError); + await expect(__test__.command.func(createPage({ status: 'empty', rows: [] }), { limit: 1 })) + .rejects.toBeInstanceOf(EmptyResultError); + await expect(__test__.command.func(createPage({ status: 'no_rows', rows: [], diagnostics: { articleCount: 1, fallbackActionCount: 2, mainTextLength: 500 } }), { limit: 1 })) + .rejects.toBeInstanceOf(CommandExecutionError); + await expect(__test__.command.func(createPage({ rows: null }), { limit: 1 })) + .rejects.toBeInstanceOf(CommandExecutionError); }); }); From b57c257978ed3dfc92ed9bc55b46666e17015537 Mon Sep 17 00:00:00 2001 From: jackwener Date: Thu, 14 May 2026 18:38:19 +0800 Subject: [PATCH 3/3] fix(facebook): keep feed fallback available after chrome articles --- clis/facebook/feed.js | 12 ++++++------ clis/facebook/feed.test.js | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/clis/facebook/feed.js b/clis/facebook/feed.js index b1db20c34..1690a58d2 100644 --- a/clis/facebook/feed.js +++ b/clis/facebook/feed.js @@ -147,13 +147,13 @@ function buildFeedExtractScript(limit) { if (!author && !content) return null; if (!content && !postUrl && kinds.size < 2) return null; - const likesMatch = fullText.match(/所有心情:([\\d,.\\s]*[\\d万亿KMk]+)/) - || fullText.match(/All:\\s*([\\d,.KMk]+)/) - || fullText.match(/([\\d,.KMk]+)\\s*(?:likes?|reactions?)/i); + const likesMatch = fullText.match(/所有心情:\\s*(\\d[\\d,.\\s万亿KMk]*)/) + || fullText.match(/All:\\s*(\\d[\\d,.KMk]*)/) + || fullText.match(/(\\d[\\d,.KMk]*)\\s*(?:likes?|reactions?)/i); const commentsMatch = fullText.match(/([\\d,.]+\\s*[万亿]?)\\s*条评论/) - || fullText.match(/([\\d,.KMk]+)\\s*comments?/i); + || fullText.match(/(\\d[\\d,.KMk]*)\\s*comments?/i); const sharesMatch = fullText.match(/([\\d,.]+\\s*[万亿]?)\\s*次分享/) - || fullText.match(/([\\d,.KMk]+)\\s*shares?/i); + || fullText.match(/(\\d[\\d,.KMk]*)\\s*shares?/i); return { index, @@ -210,7 +210,7 @@ function buildFeedExtractScript(limit) { if (isAuthPage()) return { status: 'auth', rows: [], diagnostics: {} }; const primary = primaryContainers(); - const combined = primary.length >= limit ? primary : dedupe([...primary, ...fallbackContainers()]); + const combined = dedupe([...primary, ...fallbackContainers()]); const rows = []; for (const container of combined) { const row = extractPost(container, rows.length + 1); diff --git a/clis/facebook/feed.test.js b/clis/facebook/feed.test.js index b836480de..4bb6fd290 100644 --- a/clis/facebook/feed.test.js +++ b/clis/facebook/feed.test.js @@ -99,6 +99,38 @@ describe('facebook feed', () => { expect(payload.rows).toEqual([]); }); + it('still considers bounded fallback rows when article nodes are suggestion chrome', () => { + const payload = runExtract(` +
+
+

People you may know

+
Suggested profile card with enough text to look article-like.
+ + +
+
+
+

Dana Poster

+
Fallback feed post should still be extracted after suggestion articles are filtered.
+ Permalink + + +
+
+
+ `, 1); + + expect(payload.status).toBe('ok'); + expect(payload.rows).toEqual([{ + index: 1, + author: 'Dana Poster', + content: 'Fallback feed post should still be extracted after suggestion articles are filtered.', + likes: '-', + comments: '-', + shares: '-', + }]); + }); + it('reports auth pages from the browser extractor', () => { const payload = runExtract('
Log in to Facebook
', 10, 'https://www.facebook.com/login/'); expect(payload.status).toBe('auth');