diff --git a/cli-manifest.json b/cli-manifest.json index 17b77d182..20d77a567 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -9387,7 +9387,7 @@ "type": "js", "modulePath": "facebook/feed.js", "sourceFile": "facebook/feed.js", - "navigateBefore": "https://www.facebook.com" + "navigateBefore": false }, { "site": "facebook", diff --git a/clis/facebook/feed.js b/clis/facebook/feed.js index a2bd538e0..1690a58d2 100644 --- a/clis/facebook/feed.js +++ b/clis/facebook/feed.js @@ -1,60 +1,305 @@ -import { cli } from '@jackwener/opencli/registry'; -cli({ - site: 'facebook', - name: 'feed', - access: 'read', - description: 'Get your Facebook news feed', - domain: 'www.facebook.com', - args: [ - { name: 'limit', type: 'int', default: 10, help: 'Number of posts' }, - ], - columns: ['index', 'author', 'content', 'likes', 'comments', 'shares'], - pipeline: [ - { navigate: { url: 'https://www.facebook.com/', settleMs: 4000 } }, - { evaluate: `(() => { - const limit = \${{ args.limit }}; - const posts = document.querySelectorAll('[role="article"]'); - return Array.from(posts) - .filter(el => { - const text = el.textContent.trim(); - // Filter out "People you may know" suggestions (both CN and EN) - return text.length > 30 && - !text.startsWith('可能认识') && - !text.startsWith('People you may know') && - !text.startsWith('People You May Know'); - }) - .slice(0, limit) - .map((el, i) => { - // Author from header link - const headerLink = el.querySelector('h2 a, h3 a, h4 a, strong a'); - const author = headerLink ? headerLink.textContent.trim() : ''; - - // Post text: grab visible spans, filter noise - const spans = Array.from(el.querySelectorAll('div[dir="auto"]')) - .map(s => s.textContent.trim()) - .filter(t => t.length > 10 && t.length < 500); - const content = spans.length > 0 ? spans[0] : ''; - - // Engagement: find like/comment/share counts (CN + EN) - const allText = el.textContent; - const likesMatch = allText.match(/所有心情:([\\d,.\\s]*[\\d万亿KMk]+)/) || - allText.match(/All:\\s*([\\d,.KMk]+)/) || - allText.match(/([\\d,.KMk]+)\\s*(?:likes?|reactions?)/i); - const commentsMatch = allText.match(/([\\d,.]+\\s*[万亿]?)\\s*条评论/) || - allText.match(/([\\d,.KMk]+)\\s*comments?/i); - const sharesMatch = allText.match(/([\\d,.]+\\s*[万亿]?)\\s*次分享/) || - allText.match(/([\\d,.KMk]+)\\s*shares?/i); +import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; +import { cli, Strategy } from '@jackwener/opencli/registry'; + +const FACEBOOK_HOME = 'https://www.facebook.com/'; +const MAX_LIMIT = 50; + +function requireLimit(value) { + const n = Number(value); + if (!Number.isInteger(n) || n < 1 || n > MAX_LIMIT) { + throw new ArgumentError(`facebook feed --limit must be an integer between 1 and ${MAX_LIMIT}`); + } + return n; +} + +function unwrapBrowserResult(value) { + if (value && typeof value === 'object' && 'data' in value) { + return value.data; + } + return value; +} + +function buildFeedExtractScript(limit) { + return `(() => { + const limit = ${limit}; + + function clean(value) { + return String(value || '').replace(/\\s+/g, ' ').trim(); + } + + function textOf(el) { + return clean(el && el.textContent); + } + + function labelOf(el) { + return clean(el && el.getAttribute && el.getAttribute('aria-label')); + } + + function isAuthPage() { + const path = window.location && window.location.pathname ? window.location.pathname : ''; + const body = textOf(document.body); + return /^\\/(login|checkpoint)(\\/|$|\\.php)/.test(path) + || /^(Log in to Facebook|Facebook登录|登录 Facebook)/i.test(body) + || /You must log in to continue/i.test(body); + } + + function isExplicitEmptyFeed() { + const body = textOf(document.body); + return /No posts available|Nothing to show|暂无动态|没有更多动态|还没有帖子/i.test(body); + } + + function isSuggestionOrChrome(text) { + return /^(People you may know|People You May Know|可能认识的人?|你可能认识的人?)/i.test(text) + || /^(Suggested for you|Suggested Groups|推荐小组|推荐内容)/i.test(text); + } + + function isSponsored(text) { + return /(^|\\s)(Sponsored|赞助|广告)(\\s|$)/i.test(text); + } + + function isActionText(text) { + return /^(Like|Comment|Share|Send|Follow|赞|评论|分享|发送|关注)$/i.test(text); + } + + function isMetricText(text) { + return /^(All:|所有心情:)/i.test(text) + || /\\b(likes?|reactions?|comments?|shares?)\\b/i.test(text) + || /(条评论|次分享)$/.test(text); + } + + function isTimestampText(text) { + return /^(\\d+\\s*(s|m|h|d|w|mo|yr|min|sec|second|minute|hour|day|week|month|year)s?|Just now|Yesterday|刚刚|昨天|\\d+小时|\\d+天)(\\s*[·•.])?$/i.test(text); + } + + function postUrlFrom(root) { + const links = Array.from(root.querySelectorAll('a[href]')); + for (const link of links) { + const href = link.href || link.getAttribute('href') || ''; + if (/\\/posts\\/|\\/permalink\\.php|\\/story\\.php|\\/photo\\/\\?fbid=|\\/groups\\/[^/]+\\/posts\\//i.test(href)) { + return href; + } + } + return ''; + } + + function actionKinds(root) { + const kinds = new Set(); + for (const el of root.querySelectorAll('[aria-label]')) { + const label = labelOf(el); + if (/^(Like|赞)$/i.test(label)) kinds.add('like'); + if (/^(Comment|评论)$/i.test(label)) kinds.add('comment'); + if (/^(Share|分享)$/i.test(label)) kinds.add('share'); + } + return kinds; + } + + function visibleBlocks(root) { + const seen = new Set(); + return Array.from(root.querySelectorAll('[dir="auto"]')) + .map(textOf) + .filter((text) => { + if (!text || text.length > 600 || seen.has(text)) return false; + seen.add(text); + return true; + }); + } + + function findAuthor(root) { + const links = [ + root.querySelector('h2 a[href], h3 a[href], h4 a[href], strong a[href]'), + ...Array.from(root.querySelectorAll('a[role="link"][href]')), + ].filter(Boolean); + for (const link of links) { + const text = textOf(link); + const href = link.href || link.getAttribute('href') || ''; + if (text.length > 1 && text.length <= 80 + && !isActionText(text) + && !isMetricText(text) + && !isTimestampText(text) + && !/\\/groups\\/|\\/watch\\/|\\/reel\\/|\\/events\\/|\\/friends\\//i.test(href)) { + return text; + } + } + return ''; + } + + function contentBlocks(root, author) { + return visibleBlocks(root).filter((text) => { + if (text === author) return false; + if (text.length <= 10) return false; + if (isSuggestionOrChrome(text) || isSponsored(text)) return false; + if (isActionText(text) || isMetricText(text) || isTimestampText(text)) return false; + if (/^(See more|查看更多|更多)$/i.test(text)) return false; + return true; + }); + } + + function extractPost(root, index) { + const fullText = textOf(root); + if (!fullText || isSuggestionOrChrome(fullText) || isSponsored(fullText)) return null; + + const author = findAuthor(root); + const blocks = contentBlocks(root, author); + const content = clean(blocks.join(' ')); + const postUrl = postUrlFrom(root); + const kinds = actionKinds(root); + + if (!author && !content) return null; + if (!content && !postUrl && kinds.size < 2) return null; + + const likesMatch = fullText.match(/所有心情:\\s*(\\d[\\d,.\\s万亿KMk]*)/) + || fullText.match(/All:\\s*(\\d[\\d,.KMk]*)/) + || fullText.match(/(\\d[\\d,.KMk]*)\\s*(?:likes?|reactions?)/i); + const commentsMatch = fullText.match(/([\\d,.]+\\s*[万亿]?)\\s*条评论/) + || fullText.match(/(\\d[\\d,.KMk]*)\\s*comments?/i); + const sharesMatch = fullText.match(/([\\d,.]+\\s*[万亿]?)\\s*次分享/) + || fullText.match(/(\\d[\\d,.KMk]*)\\s*shares?/i); return { - index: i + 1, + index, author: author.substring(0, 50), - content: content.replace(/\\n/g, ' ').substring(0, 120), - likes: likesMatch ? likesMatch[1] : '-', - comments: commentsMatch ? commentsMatch[1] : '-', - shares: sharesMatch ? sharesMatch[1] : '-', + content: content.substring(0, 120), + likes: likesMatch ? clean(likesMatch[1]) : '-', + comments: commentsMatch ? clean(commentsMatch[1]) : '-', + shares: sharesMatch ? clean(sharesMatch[1]) : '-', }; - }); -})() -` }, - ], -}); + } + + function primaryContainers() { + return Array.from(document.querySelectorAll('[role="article"]')) + .filter((el) => textOf(el).length > 30); + } + + function fallbackContainers() { + const main = document.querySelector('[role="main"]'); + if (!main) return []; + const buttons = Array.from(main.querySelectorAll('[aria-label="Like"], [aria-label="赞"], [aria-label="Comment"], [aria-label="评论"], [aria-label="Share"], [aria-label="分享"]')); + const seen = new WeakSet(); + const containers = []; + for (const button of buttons) { + let node = button.parentElement; + for (let depth = 0; depth < 16 && node && node !== main && node !== document.body; depth += 1, node = node.parentElement) { + const text = textOf(node); + const kinds = actionKinds(node); + const blocks = visibleBlocks(node); + const hasPostEvidence = Boolean(postUrlFrom(node)) || blocks.some((block) => block.length > 20 && !isActionText(block) && !isMetricText(block)); + if (text.length >= 80 && kinds.has('like') && (kinds.has('comment') || kinds.has('share')) && hasPostEvidence) { + if (!seen.has(node)) { + seen.add(node); + containers.push(node); + } + break; + } + } + } + return containers; + } + + function dedupe(containers) { + const seen = new Set(); + const result = []; + for (const node of containers) { + const key = postUrlFrom(node) || contentBlocks(node, findAuthor(node)).join('|').substring(0, 200); + if (!key || seen.has(key)) continue; + seen.add(key); + result.push(node); + } + return result; + } + + if (isAuthPage()) return { status: 'auth', rows: [], diagnostics: {} }; + + const primary = primaryContainers(); + const combined = dedupe([...primary, ...fallbackContainers()]); + const rows = []; + for (const container of combined) { + const row = extractPost(container, rows.length + 1); + if (row) rows.push(row); + if (rows.length >= limit) break; + } + + return { + status: rows.length ? 'ok' : (isExplicitEmptyFeed() ? 'empty' : 'no_rows'), + rows, + diagnostics: { + articleCount: document.querySelectorAll('[role="article"]').length, + primaryCount: primary.length, + fallbackActionCount: document.querySelectorAll('[role="main"] [aria-label="Like"], [role="main"] [aria-label="赞"], [role="main"] [aria-label="Comment"], [role="main"] [aria-label="评论"]').length, + mainTextLength: textOf(document.querySelector('[role="main"]')).length, + }, + }; + })()`; +} + +async function getFacebookFeed(page, kwargs) { + const limit = requireLimit(kwargs.limit ?? 10); + try { + await page.goto(FACEBOOK_HOME, { settleMs: 4000 }); + } catch (err) { + throw new CommandExecutionError( + `Failed to navigate to facebook feed: ${err instanceof Error ? err.message : err}`, + 'Check that facebook.com is reachable and the browser extension is connected.', + ); + } + + let payload; + try { + payload = unwrapBrowserResult(await page.evaluate(buildFeedExtractScript(limit))); + } catch (err) { + throw new CommandExecutionError( + `Failed to read facebook feed: ${err instanceof Error ? err.message : err}`, + 'Facebook may not have rendered or the feed markup may have changed.', + ); + } + + if (!payload || typeof payload !== 'object' || !Array.isArray(payload.rows)) { + throw new CommandExecutionError('facebook feed returned malformed extraction payload'); + } + + if (payload.status === 'auth') { + throw new AuthRequiredError('www.facebook.com', 'Open Chrome and log in to Facebook before retrying.'); + } + + if (payload.rows.length > 0) { + return payload.rows; + } + + if (payload.status === 'empty') { + throw new EmptyResultError('facebook feed', 'Facebook did not show any feed posts for this account.'); + } + + const diagnostics = payload.diagnostics || {}; + if (diagnostics.articleCount || diagnostics.fallbackActionCount || diagnostics.mainTextLength > 200) { + throw new CommandExecutionError( + 'facebook feed page rendered but no feed rows could be extracted', + `Diagnostics: articles=${diagnostics.articleCount || 0}, actions=${diagnostics.fallbackActionCount || 0}, mainTextLength=${diagnostics.mainTextLength || 0}.`, + ); + } + + throw new EmptyResultError('facebook feed', 'No Facebook feed content was visible in the current browser session.'); +} + +const command = { + site: 'facebook', + name: 'feed', + access: 'read', + description: 'Get your Facebook news feed', + domain: 'www.facebook.com', + strategy: Strategy.COOKIE, + browser: true, + navigateBefore: false, + args: [ + { name: 'limit', type: 'int', default: 10, help: 'Number of posts' }, + ], + columns: ['index', 'author', 'content', 'likes', 'comments', 'shares'], + func: getFacebookFeed, +}; + +cli(command); + +export const __test__ = { + buildFeedExtractScript, + command, + getFacebookFeed, + requireLimit, +}; diff --git a/clis/facebook/feed.test.js b/clis/facebook/feed.test.js new file mode 100644 index 000000000..4bb6fd290 --- /dev/null +++ b/clis/facebook/feed.test.js @@ -0,0 +1,169 @@ +import { describe, expect, it, vi } from 'vitest'; +import { JSDOM } from 'jsdom'; +import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; +import { getRegistry } from '@jackwener/opencli/registry'; +import { __test__ } from './feed.js'; + +function runExtract(html, limit = 10, url = 'https://www.facebook.com/') { + const dom = new JSDOM(html, { url }); + return Function('window', 'document', `return ${__test__.buildFeedExtractScript(limit)};`)(dom.window, dom.window.document); +} + +function createPage(payload) { + return { + goto: vi.fn().mockResolvedValue(undefined), + evaluate: vi.fn().mockResolvedValue(payload), + }; +} + +describe('facebook feed', () => { + it('registers the feed command with the existing row contract', () => { + const cmd = getRegistry().get('facebook/feed'); + expect(cmd).toBeDefined(); + expect(cmd.columns).toEqual(['index', 'author', 'content', 'likes', 'comments', 'shares']); + }); + + it('extracts existing role=article feed rows', () => { + const payload = runExtract(` + + + Alice Example + This is a normal Facebook feed post with enough text to extract. + All: 12 + 3 comments + 2 shares + + + + `); + + expect(payload.status).toBe('ok'); + expect(payload.rows).toEqual([{ + index: 1, + author: 'Alice Example', + content: 'This is a normal Facebook feed post with enough text to extract.', + likes: '12', + comments: '3', + shares: '2', + }]); + }); + + it('falls back from empty article nodes to action-bounded feed containers', () => { + const payload = runExtract(` + + + + + Bob Builder + Fallback post body from a Facebook feed card with empty article text. + Permalink + All: 1.2K + 4 comments + 1 shares + LikeComment + + + + `); + + expect(payload.status).toBe('ok'); + expect(payload.rows).toEqual([{ + index: 1, + author: 'Bob Builder', + content: 'Fallback post body from a Facebook feed card with empty article text.', + likes: '1.2K', + comments: '4', + shares: '1', + }]); + }); + + it('does not turn suggestions or side chrome action buttons into feed rows', () => { + const payload = runExtract(` + + + + Navigation item with a Like button but not a feed post. + Like + Comment + + + `); + + expect(payload.status).toBe('no_rows'); + expect(payload.rows).toEqual([]); + }); + + it('still considers bounded fallback rows when article nodes are suggestion chrome', () => { + const payload = runExtract(` + + + People you may know + Suggested profile card with enough text to look article-like. + Like + Comment + + + + Dana Poster + Fallback feed post should still be extracted after suggestion articles are filtered. + Permalink + Like + Comment + + + + `, 1); + + expect(payload.status).toBe('ok'); + expect(payload.rows).toEqual([{ + index: 1, + author: 'Dana Poster', + content: 'Fallback feed post should still be extracted after suggestion articles are filtered.', + likes: '-', + comments: '-', + shares: '-', + }]); + }); + + it('reports auth pages from the browser extractor', () => { + const payload = runExtract('Log in to Facebook', 10, 'https://www.facebook.com/login/'); + expect(payload.status).toBe('auth'); + expect(payload.rows).toEqual([]); + }); + + it('validates limit before browser navigation', async () => { + const page = createPage({ status: 'ok', rows: [] }); + await expect(__test__.command.func(page, { limit: 0 })).rejects.toBeInstanceOf(ArgumentError); + expect(page.goto).not.toHaveBeenCalled(); + }); + + it('maps browser envelopes and returns extracted rows', async () => { + const page = createPage({ session: 'site:facebook', data: { status: 'ok', rows: [{ index: 1, author: 'A', content: 'Body', likes: '-', comments: '-', shares: '-' }] } }); + + await expect(__test__.command.func(page, { limit: 1 })).resolves.toEqual([{ + index: 1, + author: 'A', + content: 'Body', + likes: '-', + comments: '-', + shares: '-', + }]); + }); + + it('maps auth, real empty, parser drift, and malformed payloads to typed errors', async () => { + await expect(__test__.command.func(createPage({ status: 'auth', rows: [] }), { limit: 1 })) + .rejects.toBeInstanceOf(AuthRequiredError); + await expect(__test__.command.func(createPage({ status: 'empty', rows: [] }), { limit: 1 })) + .rejects.toBeInstanceOf(EmptyResultError); + await expect(__test__.command.func(createPage({ status: 'no_rows', rows: [], diagnostics: { articleCount: 1, fallbackActionCount: 2, mainTextLength: 500 } }), { limit: 1 })) + .rejects.toBeInstanceOf(CommandExecutionError); + await expect(__test__.command.func(createPage({ rows: null }), { limit: 1 })) + .rejects.toBeInstanceOf(CommandExecutionError); + }); +});