diff --git a/cli-manifest.json b/cli-manifest.json index 2e4b709a0..c437c167a 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -21246,6 +21246,46 @@ "sourceFile": "tdx/hot-rank.js", "navigateBefore": true }, + { + "site": "threads", + "name": "search", + "description": "Search Threads posts using the logged-in browser session", + "access": "read", + "domain": "www.threads.com", + "strategy": "ui", + "browser": true, + "args": [ + { + "name": "query", + "type": "str", + "required": true, + "positional": true, + "help": "Search keyword" + }, + { + "name": "limit", + "type": "int", + "default": 20, + "required": false, + "help": "Max results (1-50)" + } + ], + "columns": [ + "rank", + "username", + "displayName", + "text", + "timestamp", + "url", + "replyCount", + "repostCount", + "likeCount" + ], + "type": "js", + "modulePath": "threads/search.js", + "sourceFile": "threads/search.js", + "navigateBefore": false + }, { "site": "ths", "name": "hot-rank", diff --git a/clis/threads/search.js b/clis/threads/search.js new file mode 100644 index 000000000..aaf7ac1f2 --- /dev/null +++ b/clis/threads/search.js @@ -0,0 +1,240 @@ +import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError, TimeoutError } from '@jackwener/opencli/errors'; +import { cli, Strategy } from '@jackwener/opencli/registry'; + +const BASE_URL = 'https://www.threads.com'; +const DEFAULT_LIMIT = 20; +const MAX_LIMIT = 50; + +export function normalizeQuery(value) { /* Return the trimmed search query; throw ArgumentError when it is nullish or blank. */ + const query = String(value ?? '').trim(); + if (!query) throw new ArgumentError('query is required'); + return query; +} + +export function normalizeLimit(value, defaultValue = DEFAULT_LIMIT, maxValue = MAX_LIMIT) { /* Return value (defaultValue when nullish) as a positive integer no greater than maxValue; out-of-range input throws ArgumentError instead of being clamped. */ + const raw = value ?? 
defaultValue; + const limit = Number(raw); + if (!Number.isInteger(limit) || limit <= 0) throw new ArgumentError('limit must be a positive integer'); + if (limit > maxValue) throw new ArgumentError(`limit must be <= ${maxValue}`); + return limit; +} + +export function buildSearchUrl(query) { /* Build the Threads web search URL for query; URLSearchParams takes care of the encoding. */ + const url = new URL('/search', BASE_URL); + url.searchParams.set('q', query); + url.searchParams.set('serp_type', 'default'); + return url.toString(); +} + +export function parseCompactCount(value) { /* Parse a UI count such as "1,046", "2.3K" or "1.5 万" into an integer; return null for blank or unrecognized text. */ + const text = String(value ?? '').replace(/,/g, '').trim(); + if (!text) return null; + const match = text.match(/^([0-9]+(?:\.[0-9]+)?)\s*(万|[KkMm])?$/); + if (!match) return null; + const n = Number(match[1]); + const unit = match[2] ?? ''; + const multiplier = unit === '万' ? 10_000 : /k/i.test(unit) ? 1_000 : /m/i.test(unit) ? 1_000_000 : 1; + return Math.round(n * multiplier); +} + +export function parseThreadsTimestamp(value) { /* Convert a zh-CN Threads time label such as "2026年4月15日 下午3:05" or "2026年4月15日 15:05" into an ISO-8601 UTC string, reading the wall-clock fields in the local timezone. Returns null for blank input and the trimmed original text when the label has another format or names an impossible date or time. */ + const text = String(value ?? '').trim(); + const match = text.match(/^(\d{4})年(\d{1,2})月(\d{1,2})日.*?(上午|下午)?\s*(\d{1,2}):(\d{2})$/); + if (!match) return text || null; + const [year, month, day, clockHour, minute] = [1, 2, 3, 5, 6].map((i) => Number(match[i])); + const hour = match[4] ? (clockHour % 12) + (match[4] === '下午' ? 12 : 0) : clockHour; /* 上午/下午 labels use a 12-hour clock */ + const date = new Date(year, month - 1, day, hour, minute, 0); + if (hour > 23 || minute > 59 || date.getFullYear() !== year || date.getMonth() !== month - 1 || date.getDate() !== day) return text; /* Date rolls out-of-range fields over instead of yielding NaN, so verify the round trip */ + return date.toISOString(); +} + +function normalizeExtractedRows(rows, limit) { /* Validate the extractor payload, drop rows lacking a username, url or non-blank text, cap the list at limit and map each row to the adapter columns with a 1-based rank. Throws CommandExecutionError when rows is not an array. */ + if (!Array.isArray(rows)) throw new CommandExecutionError('Threads search response shape changed: rows missing'); + return rows + .filter((row) => row && row.username && row.url && row.text && String(row.text).trim()) + .slice(0, limit) + .map((row, index) => ({ + rank: index + 1, + username: String(row.username), + displayName: row.displayName ? String(row.displayName) : null, + text: String(row.text).trim(), + timestamp: row.timestamp ? String(row.timestamp) : null, + url: String(row.url), + replyCount: Number.isFinite(row.replyCount) ? row.replyCount : null, + repostCount: Number.isFinite(row.repostCount) ? 
row.repostCount : null, + likeCount: Number.isFinite(row.likeCount) ? row.likeCount : null, + })); +} + +/* In-page extractor run through page.evaluateWithArgs; it expects a limit binding from that call. It polls the DOM for up to 12 s and resolves to { rows }, { authRequired: true, rows: [] } or { timeout: true, rows }. Its parseCount and parseTimestamp helpers mirror parseCompactCount and parseThreadsTimestamp above and must be kept in sync with them. */ +const EXTRACT_SEARCH_ROWS_SCRIPT = ` + (async () => { + const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + const normalize = (value) => String(value || '').replace(/\\s+/g, ' ').trim(); + const normalizeLines = (value) => String(value || '').split('\\n').map((line) => line.trim()).filter(Boolean); + const parseCount = (value) => { + const text = String(value || '').replace(/,/g, '').trim(); + if (!text) return null; + const match = text.match(/^([0-9]+(?:\\.[0-9]+)?)\\s*(万|[KkMm])?$/); + if (!match) return null; + const unit = match[2] || ''; + const multiplier = unit === '万' ? 10000 : /k/i.test(unit) ? 1000 : /m/i.test(unit) ? 1000000 : 1; + return Math.round(Number(match[1]) * multiplier); + }; + const parseTimestamp = (value) => { + const text = String(value || '').trim(); + const match = text.match(/^(\\d{4})年(\\d{1,2})月(\\d{1,2})日.*?(上午|下午)?\\s*(\\d{1,2}):(\\d{2})$/); + if (!match) return text || null; + const [year, month, day, clockHour, minute] = [1, 2, 3, 5, 6].map((i) => Number(match[i])); + const hour = match[4] ? (clockHour % 12) + (match[4] === '下午' ? 12 : 0) : clockHour; + const date = new Date(year, month - 1, day, hour, minute, 0); + if (hour > 23 || minute > 59 || date.getFullYear() !== year || date.getMonth() !== month - 1 || date.getDate() !== day) return text; 
+ return date.toISOString(); + }; + const isProfileHref = (href) => /^\\/@[^/?#]+\\/?$/.test(href || ''); + const isPostHref = (href) => /\\/post\\//.test(href || '') && !/\\/media(?:[?#/]|$)/.test(href || ''); + const absoluteUrl = (href) => { + try { const parsed = new URL(href, location.origin); parsed.search = ''; parsed.hash = ''; return parsed.toString(); } /* drop query and fragment so one post always maps to one URL */ + catch { return String(href || ''); } + }; + const buttonForSvg = (svg) => { + let node = svg; + for (let i = 0; i < 6 && node; i += 1, node = node.parentElement) { + if ((node.getAttribute('role') || '').includes('button')) return node; + } + return null; + }; + const actionCount = (card, labels) => { + for (const svg of Array.from(card.querySelectorAll('svg[aria-label]'))) { + if (!labels.includes(svg.getAttribute('aria-label'))) continue; + const button = buttonForSvg(svg); + const count = parseCount(button && button.innerText); + if (count !== null) return count; + } + return null; + }; + const findCard = (postLink) => { /* Walk up at most 14 ancestors to the first element that holds exactly one post URL, a profile link, a time element and an action icon. */ + let card = postLink; + for (let depth = 0; depth < 14 && card; depth += 1, card = card.parentElement) { + const uniquePostUrls = new Set( + Array.from(card.querySelectorAll('a')) + .map((link) => absoluteUrl(link.getAttribute('href') || '')) + .filter((href) => href.includes('/post/') && !href.includes('/media')) + ); + const profile = Array.from(card.querySelectorAll('a')).find((link) => isProfileHref(link.getAttribute('href') || '')); + const time = card.querySelector('time'); + const labels = Array.from(card.querySelectorAll('svg[aria-label]')).map((svg) => svg.getAttribute('aria-label')); + if (uniquePostUrls.size === 1 && profile && time && labels.some((label) => ['赞', '回复', '评论', '转发', '分享'].includes(label))) { /* NOTE(review): action labels are matched in Chinese only, while loginWallVisible also handles English text; a non-zh UI presumably never matches here. Confirm against an English session. */ + return card; + } + } + return null; + }; + const cleanPostText = (card, username, displayName, visibleTime) => { + const remove = new Set([username, displayName, visibleTime, '翻译', '关注', '更多', '已编辑'].filter(Boolean)); + for (const svg of Array.from(card.querySelectorAll('svg[aria-label]'))) { + const button = 
buttonForSvg(svg); + const buttonText = normalize(button && button.innerText); + if (buttonText) remove.add(buttonText); + } + return normalizeLines(card.innerText) + .filter((line) => !remove.has(line)) + .filter((line) => !/^\\d+\\s*\\/\\s*\\d+$/.test(line)) + .join('\\n') + .trim(); + }; + const loginWallVisible = () => { + const text = document.body ? document.body.innerText || '' : ''; + return location.pathname.includes('/login') + || (/登录或注册 Threads/.test(text) && /用 Instagram 登录/.test(text)) + || (/Log in or sign up/.test(text) && /Instagram/.test(text)); + }; + const extractRows = () => { + const rows = []; + const seen = new Set(); + const postLinks = Array.from(document.links).filter((link) => isPostHref(link.getAttribute('href') || '')); + for (const link of postLinks) { + const url = absoluteUrl(link.getAttribute('href') || link.href); + if (!url || seen.has(url)) continue; + const card = findCard(link); + if (!card) continue; + const profile = Array.from(card.querySelectorAll('a')).find((item) => isProfileHref(item.getAttribute('href') || '')); + const profileHref = profile && profile.getAttribute('href') || ''; + const username = profileHref.replace(/^\\/@/, '').replace(/\\/$/, ''); + const displayName = null; + const timeEl = card.querySelector('time'); + const visibleTime = normalize(timeEl && timeEl.innerText); + const timestamp = parseTimestamp(timeEl && (timeEl.getAttribute('title') || timeEl.innerText)); + const text = cleanPostText(card, username, displayName, visibleTime); + if (!username || !text) continue; + rows.push({ + username, + displayName, + text, + timestamp, + url, + replyCount: actionCount(card, ['回复', '评论']), + repostCount: actionCount(card, ['转发']), + likeCount: actionCount(card, ['赞']), + }); + seen.add(url); + if (rows.length >= limit) break; + } + return rows; + }; + + const deadline = Date.now() + 12000; + while (Date.now() < deadline) { + if (loginWallVisible()) return { authRequired: true, rows: [] }; + const rows = 
extractRows(); + if (rows.length > 0) return { rows }; + await sleep(400); + } + if (loginWallVisible()) return { authRequired: true, rows: [] }; + return { timeout: true, rows: extractRows() }; + })() +`; + +export const searchCommand = cli({ /* Registers the threads search command: validates the arguments, opens the search page, runs the in-page extractor and maps its outcome to ranked rows or a typed error. */ + site: 'threads', + name: 'search', + access: 'read', + description: 'Search Threads posts using the logged-in browser session', + domain: 'www.threads.com', + strategy: Strategy.UI, + navigateBefore: false, + browser: true, + args: [ + { name: 'query', positional: true, required: true, help: 'Search keyword' }, + { name: 'limit', type: 'int', default: DEFAULT_LIMIT, help: `Max results (1-${MAX_LIMIT})` }, + ], + columns: ['rank', 'username', 'displayName', 'text', 'timestamp', 'url', 'replyCount', 'repostCount', 'likeCount'], + func: async (page, args) => { + const query = normalizeQuery(args.query); + const limit = normalizeLimit(args.limit); + + await page.goto(buildSearchUrl(query), { allowBoundNavigation: true, settleMs: 2000 }); + const result = await page.evaluateWithArgs(EXTRACT_SEARCH_ROWS_SCRIPT, { limit }); + if (!result || typeof result !== 'object') { + throw new CommandExecutionError('Threads search response shape changed: extractor returned no object'); + } + if (result.authRequired) { + throw new AuthRequiredError('www.threads.com', 'Not logged in to Threads in the active Chrome profile'); + } + const rows = normalizeExtractedRows(result.rows, limit); + if (rows.length === 0) { + if (result.timeout) throw new TimeoutError('Threads search results', 12); /* 12 presumably means seconds and mirrors the 12000 ms extractor deadline; keep the two in sync */ + throw new EmptyResultError('threads search', `No post results for "${query}"`); + } + return rows; + }, +}); + +export const __test__ = { + buildSearchUrl, + normalizeExtractedRows, + normalizeLimit, + normalizeQuery, + parseCompactCount, + parseThreadsTimestamp, +}; diff --git a/clis/threads/search.test.js b/clis/threads/search.test.js new file mode 100644 index 000000000..99922c3b7 --- /dev/null +++ b/clis/threads/search.test.js @@ -0,0 +1,108 @@ +import { 
describe, expect, it, vi } from 'vitest'; +import { ArgumentError, AuthRequiredError, TimeoutError } from '@jackwener/opencli/errors'; +import { __test__, searchCommand } from './search.js'; + +function createPageMock(result) { /* Minimal page stub: goto resolves immediately and evaluateWithArgs resolves to the supplied extractor result. */ + return { + goto: vi.fn().mockResolvedValue(undefined), + evaluateWithArgs: vi.fn().mockResolvedValue(result), + }; +} + +describe('threads search helpers', () => { + it('validates query and limit without silent clamping', () => { + expect(__test__.normalizeQuery(' openai ')).toBe('openai'); + expect(__test__.normalizeLimit(undefined)).toBe(20); + expect(__test__.normalizeLimit(50)).toBe(50); + expect(() => __test__.normalizeQuery(' ')).toThrow(ArgumentError); + expect(() => __test__.normalizeLimit(0)).toThrow(ArgumentError); + expect(() => __test__.normalizeLimit(51)).toThrow(ArgumentError); + }); + + it('builds a conservative Threads search URL', () => { + expect(__test__.buildSearchUrl('openai')).toBe('https://www.threads.com/search?q=openai&serp_type=default'); + }); + + it('parses compact engagement counts', () => { + expect(__test__.parseCompactCount('1,046')).toBe(1046); + expect(__test__.parseCompactCount('1.5 万')).toBe(15000); + expect(__test__.parseCompactCount('2.3K')).toBe(2300); + expect(__test__.parseCompactCount('')).toBeNull(); + expect(__test__.parseCompactCount('reply')).toBeNull(); + }); + + it('parses zh-CN timestamps and keeps unparseable labels as text', () => { + expect(__test__.parseThreadsTimestamp('2026年4月15日 08:30')).toBe(new Date(2026, 3, 15, 8, 30, 0).toISOString()); + expect(__test__.parseThreadsTimestamp('2026年4月15日 下午3:05')).toBe(new Date(2026, 3, 15, 15, 5, 0).toISOString()); + expect(__test__.parseThreadsTimestamp('2026年13月40日 25:61')).toBe('2026年13月40日 25:61'); + expect(__test__.parseThreadsTimestamp('3小时')).toBe('3小时'); + expect(__test__.parseThreadsTimestamp('')).toBeNull(); + }); + + it('normalizes extracted rows to adapter columns', () => { + const rows = __test__.normalizeExtractedRows([ + { + username: 'openai', + displayName: '', + text: 'Research update', + timestamp: '2026-04-15T00:00:00.000Z', + url: 'https://www.threads.com/@openai/post/DXHjavPlGfH', + replyCount: 10, + repostCount: null, + likeCount: 68, + }, + { username: '', text: 'bad', url: 'https://example.com' }, + { username: 'blank', text: ' ', url: 'https://www.threads.com/@blank/post/blank' }, + ], 20); + expect(rows).toEqual([ + { + rank: 1, + username: 'openai', + displayName: null, + text: 'Research update', + timestamp: '2026-04-15T00:00:00.000Z', + url: 'https://www.threads.com/@openai/post/DXHjavPlGfH', + replyCount: 10, + 
repostCount: null, + likeCount: 68, + }, + ]); + }); +}); + +describe('threads search command', () => { + it('navigates to search and returns extracted rows', async () => { + const page = createPageMock({ + rows: [ + { + username: 'openai', + displayName: null, + text: 'Another season, another year of making history together.', + timestamp: '2026-04-15T00:00:00.000Z', + url: 'https://www.threads.com/@openai/post/DXHjavPlGfH', + replyCount: 10, + repostCount: 3, + likeCount: 68, + }, + ], + }); + + const rows = await searchCommand.func(page, { query: 'openai', limit: 5 }); + expect(page.goto).toHaveBeenCalledWith( + 'https://www.threads.com/search?q=openai&serp_type=default', + { allowBoundNavigation: true, settleMs: 2000 }, + ); + expect(page.evaluateWithArgs).toHaveBeenCalledWith(expect.any(String), { limit: 5 }); + expect(rows[0]).toMatchObject({ rank: 1, username: 'openai', likeCount: 68 }); + }); + + it('reports auth-required when Threads shows a login wall', async () => { + const page = createPageMock({ authRequired: true, rows: [] }); + await expect(searchCommand.func(page, { query: 'openai', limit: 5 })).rejects.toThrow(AuthRequiredError); + }); + + it('reports timeout when no rows render before the extractor deadline', async () => { + const page = createPageMock({ timeout: true, rows: [] }); + await expect(searchCommand.func(page, { query: 'openai', limit: 5 })).rejects.toThrow(TimeoutError); + }); +}); diff --git a/designs/threads-login-search-adapter.md b/designs/threads-login-search-adapter.md new file mode 100644 index 000000000..a4ab35b16 --- /dev/null +++ b/designs/threads-login-search-adapter.md @@ -0,0 +1,418 @@ +# Threads Login-State Search Adapter — Design Plan + +**Date**: 2026-05-11 + +**Status**: Proposed + +**Target command**: `opencli threads search` + +--- + +## Problem Statement + +The user wants OpenCLI agents to search Meta Threads content without using the official Threads API Keyword Search permission. 
The adapter should use the user's existing Threads login state in Chrome and return structured search results that an agent can consume. + +This is not a replacement for the official API. It is a browser-backed adapter for data that the logged-in user can already view in the Threads web UI. + +As of this plan, the primary web domain should be treated as `www.threads.com`. Legacy `threads.net` URLs may redirect and should be observed during recon, but new adapter code should start from the current web domain unless live evidence says otherwise. + +## Goals + +1. Search Threads posts by keyword using the user's logged-in Chrome session. +2. Return normalized rows with stable columns: author, text, timestamp, URL, and engagement metadata when available. +3. Prefer HTTP/JSON data captured from the Threads web app over brittle DOM scraping. +4. Keep the first implementation narrow enough to verify and repair when Threads changes. +5. Avoid bypassing permissions, scraping private content, or building high-volume crawling behavior. + +## Non-Goals + +- Do not use `https://graph.threads.net/keyword_search`. +- Do not require a Meta developer app or `threads_keyword_search` permission. +- Do not search content unavailable to the logged-in user in the normal Threads UI. +- Do not implement large-scale pagination or historical archive collection in the first version. +- Do not mix profile search and post search in one command unless the web endpoint naturally returns both and fields are clearly separated. 
+ +## User-Facing Command + +First version: + +```bash +opencli threads search "openai" --limit 20 -f json +``` + +Likely options: + +| Arg | Type | Default | Notes | +|-----|------|---------|-------| +| `query` | positional string | required | Search keyword entered in Threads search UI | +| `limit` | int | `20` | Clamp to a conservative maximum, probably `50` | +| `tab` | string | `top` | Optional if Threads exposes `top` / `recent` / similar tabs | +| `cursor` | string | none | Only add after the first endpoint is understood | +| `raw` | boolean | `false` | Optional debugging escape hatch for field discovery | + +Output columns: + +| Column | Meaning | +|--------|---------| +| `rank` | 1-based rank in returned result order | +| `username` | Threads username / handle | +| `displayName` | Display name if available | +| `text` | Post text, trimmed | +| `timestamp` | ISO timestamp if available; otherwise relative UI time only as fallback | +| `url` | Threads post permalink | +| `replyCount` | Replies, nullable | +| `repostCount` | Reposts, nullable | +| `likeCount` | Likes, nullable | +| `isReply` | Boolean or nullable | +| `isQuote` | Boolean or nullable | + +Minimum acceptable first version columns: + +```text +rank, username, text, timestamp, url +``` + +The adapter should not invent counts. If the internal response does not expose a metric reliably, return `null` or omit that column from the first version. + +## Adapter Strategy + +Use `Strategy.INTERCEPT` if the search endpoint requires dynamic request headers or internal tokens that are easiest to capture from the live web app. + +Use `Strategy.COOKIE` only if recon proves the endpoint can be called from `page.evaluate()` with ordinary session cookies and stable headers. + +Use `Strategy.UI` only as a fallback if the site does not expose a reusable JSON/XHR response. UI scraping should be the last resort because Threads can change DOM structure frequently. 
+ +Recommended first implementation: + +```js +import { cli, Strategy } from '@jackwener/opencli/registry'; + +cli({ + site: 'threads', + name: 'search', + access: 'read', + description: 'Search Threads posts using the logged-in browser session', + domain: 'www.threads.com', + strategy: Strategy.INTERCEPT, + browser: true, + args: [ + { name: 'query', positional: true, required: true, help: 'Search keyword' }, + { name: 'limit', type: 'int', default: 20, help: 'Max results' }, + ], + columns: ['rank', 'username', 'displayName', 'text', 'timestamp', 'url', 'replyCount', 'repostCount', 'likeCount'], + func: async (page, args) => { + // Implementation depends on recon output. + }, +}); +``` + +## Discovery Plan + +Follow the `opencli-adapter-author` workflow. Do not guess internal Threads endpoints. + +### 1. Validate Browser Bridge + +```bash +opencli doctor +``` + +Required outcome: + +- Daemon is running. +- Chrome extension is connected. +- Browser commands can open and inspect pages. + +If doctor fails, fix browser bridge first. This is not an adapter problem. + +### 2. Confirm Login State + +Open Threads in the browser-backed workspace: + +```bash +opencli browser open https://www.threads.com/ +opencli browser state -f json +``` + +Required outcome: + +- Page shows logged-in Threads UI. +- No login wall, checkpoint, CAPTCHA, or "try again later" page. + +If login is missing, the user must log in through Chrome manually. The adapter should not automate login. + +### 3. Recon Search UI + +Run a manual search in a live browser session: + +```bash +opencli browser open "https://www.threads.com/search?q=openai" +opencli browser state -f json +opencli browser network --detail -f json +``` + +If URL search parameters do not trigger results, drive the UI: + +```bash +opencli browser find "Search" +opencli browser type "openai" +opencli browser network --detail -f json +``` + +Inspect network requests and identify: + +- Search endpoint URL. +- HTTP method. 
+- Required query/body parameters. +- Required request headers. +- Whether the response is JSON, GraphQL JSON, Relay payload, or streamed chunks. +- Pagination cursor, if present. +- Response fields for post text, author, permalink, timestamp, and counts. + +### 4. Endpoint Verification + +Before writing the adapter, call the candidate endpoint from the browser context with the exact observed parameters. + +Success criteria: + +- HTTP status is 200. +- Response contains search results. +- At least one result can be matched visually against the Threads UI. +- A second keyword also returns plausible results. + +Failure handling: + +| Failure | Meaning | Next Step | +|---------|---------|-----------| +| 401 / 403 | Missing cookie, token, or header | Compare captured request headers and cookies | +| HTML response | Wrong endpoint or navigation page | Return to network discovery | +| Empty results for obvious query | Missing parameter, wrong tab, or wrong query encoding | Compare with live search request | +| CAPTCHA / checkpoint | Human verification required | Stop; do not code around it | +| Rate limit | Site-side throttling | Stop and retry later | + +### 5. Field Decode + +Decode fields against visible UI values. For one known search result, verify: + +- `username` matches the visible handle. +- `text` matches the visible post content. +- `url` opens the same post. +- `timestamp` is either an absolute timestamp from JSON or a defensible conversion from response data. +- Counts match UI values, if counts are included. + +Do not rely only on "non-empty result" verification. A wrong nested field can still produce plausible but incorrect output. + +## Implementation Plan + +### Phase 1: Private Adapter Prototype + +Create a private adapter first: + +```bash +opencli browser init threads/search +``` + +This should generate: + +```text +~/.opencli/clis/threads/search.js +``` + +Build the first prototype there. 
This allows fast iteration without touching the public repo adapter list. + +Prototype responsibilities: + +1. Validate `query` is non-empty. +2. Validate `limit` is a bounded integer. +3. Navigate to Threads search UI or a neutral Threads page. +4. Trigger search or replay the discovered request. +5. Normalize result rows. +6. Throw typed errors for auth, empty results, timeout, and response shape failures. + +### Phase 2: Stable Fetch Path + +Prefer this control flow if the endpoint is replayable: + +```text +page.goto("https://www.threads.com/") + -> page.evaluate(fetch search endpoint with credentials: "include") + -> parse JSON + -> normalize rows +``` + +This is simpler than UI scraping and easier to test. + +Use `Strategy.COOKIE` if this path is enough. + +### Phase 3: Intercept Path + +Use this if the endpoint requires dynamic headers or tokens: + +```text +page.goto("https://www.threads.com/search") + -> install network listener + -> type query into search box + -> wait for matching XHR/fetch request + -> capture response body + -> normalize rows +``` + +Use `Strategy.INTERCEPT` for this path. + +The matching rule should be specific enough to avoid unrelated feed requests. Match by endpoint path plus request parameters, not only by host. + +### Phase 4: UI Fallback + +Use only if no usable response body is available. + +Control flow: + +```text +page.goto(search page) + -> type query + -> wait for visible result articles + -> scroll until limit or timeout + -> extract visible text, links, handles, timestamps +``` + +This version should return fewer columns: + +```text +rank, username, text, timestamp, url +``` + +Do not include engagement counts unless they are reliably visible and parsable. 
+ +## Error Handling + +Use typed errors where possible: + +| Case | Error | +|------|-------| +| Empty query | `ArgumentError` | +| Invalid limit | `ArgumentError` | +| Not logged in | `AuthRequiredError` | +| No matching results | `EmptyResultError` | +| Search request never appears | `TimeoutError` or `CommandExecutionError` | +| Response shape changed | `CommandExecutionError` | +| CAPTCHA/checkpoint/rate limit | `CommandExecutionError` with clear user-facing message | + +Avoid silent fallbacks: + +- Do not return `[]` for auth failure. +- Do not return sentinel rows. +- Do not silently clamp bad user input without telling the user through validation. + +## Verification Plan + +### Local Validation + +```bash +opencli validate threads/search +``` + +Expected: + +- Command is registered. +- Columns match returned row keys. +- Args are valid. + +### Browser Verification + +Run with live trace during development: + +```bash +opencli browser verify threads/search --trace on --live --focus +``` + +After first success, write a fixture: + +```bash +opencli browser verify threads/search --write-fixture +``` + +Then tighten the fixture: + +- Require non-empty `username`, `text`, and `url`. +- Require URL pattern matching Threads post permalinks. +- Require row count to be within a small expected range for a stable test keyword. +- Treat counts as nullable unless proven stable. + +### Manual Cross-Check + +For a known query: + +1. Run adapter with `-f json`. +2. Open the first result URL in browser. +3. Confirm username, text, and timestamp match. +4. Confirm the visible result appears in Threads search UI. + +This is required before moving from private adapter to public `clis/threads/search.js`. 
+ +## Site Memory + +After verification, write local site memory: + +```text +~/.opencli/sites/threads/endpoints.json +~/.opencli/sites/threads/field-map.json +~/.opencli/sites/threads/notes.md +~/.opencli/sites/threads/verify/search.json +~/.opencli/sites/threads/fixtures/search-<timestamp>.json +``` + +Rules: + +- Strip cookies, auth headers, and account-specific private fields from fixtures. +- Record endpoint URL patterns, required params, response shape, and `verified_at`. +- Append notes rather than overwriting older discoveries. +- Store raw discovery dumps only under `~/.opencli/sites/threads/fixtures/` or `/tmp`. + +## Public Repo Integration + +Only after the private adapter is verified: + +1. Copy implementation to `clis/threads/search.js`. +2. Add tests if helper functions are factored out. +3. Add `docs/adapters/browser/threads.md` with usage and prerequisites. +4. Run: + +```bash +opencli validate threads/search +npm test +``` + +For a narrower local pass during iteration, use `npm run test:adapter`. + +## Open Questions + +1. Does Threads expose a direct search URL such as `/search?q=...`, or must the adapter drive the search box? +2. Are post search and profile search separate tabs/endpoints? +3. Does the internal response include stable absolute timestamps? +4. Are engagement counts present in the search response, or only on post detail pages? +5. Does pagination require a cursor from the response or repeated UI scrolling? +6. Does the endpoint host use `threads.com`, legacy `threads.net`, `instagram.com`, or a GraphQL relay domain? + +These should be answered by live recon before implementation. 
+ +## Risk Assessment + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Threads changes internal endpoint | Adapter breaks | Use trace-based repair and site memory | +| Login checkpoint appears | Command cannot run | Report auth issue, do not bypass | +| DOM changes | UI fallback breaks | Prefer network response path | +| Response fields are obfuscated | Field mapping can be wrong | Cross-check against visible UI | +| Rate limiting | Temporary failures | Keep conservative limits and no bulk crawling | +| Search personalization | Results differ across users | Document that results reflect logged-in account | + +## Recommended First Cut + +Implement the first cut as a private `Strategy.INTERCEPT` adapter with conservative output: + +```text +rank, username, displayName, text, timestamp, url +``` + +Add engagement columns only after recon proves they are available in the same search response. Avoid post-detail fan-out in version 1 because that turns one search into many requests and increases rate-limit risk. + +Once the endpoint and fields are stable across at least two queries, move the adapter into `clis/threads/search.js` and document it as a browser-login command. diff --git a/openspec/changes/threads-login-search-adapter/.openspec.yaml b/openspec/changes/threads-login-search-adapter/.openspec.yaml new file mode 100644 index 000000000..81cd71fe0 --- /dev/null +++ b/openspec/changes/threads-login-search-adapter/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-11 diff --git a/openspec/changes/threads-login-search-adapter/design.md b/openspec/changes/threads-login-search-adapter/design.md new file mode 100644 index 000000000..dd4e49f9c --- /dev/null +++ b/openspec/changes/threads-login-search-adapter/design.md @@ -0,0 +1,46 @@ +## Context + +OpenCLI already supports browser-backed adapters that run against a user's Chrome session. 
Threads search is different from public HTTP adapters because the useful search surface depends on the logged-in web UI and internal web responses, while the official keyword search API requires permissions that this change explicitly avoids. + +The source design in `designs/threads-login-search-adapter.md` treats `www.threads.com` as the primary domain, with `threads.net` only as a legacy redirect source to observe during recon. The adapter must only expose content the user can already view in the normal Threads UI. + +## Goals / Non-Goals + +**Goals:** +- Provide `opencli threads search <query>` with bounded result collection. +- Use the logged-in Chrome session through OpenCLI's browser bridge. +- Prefer reusable JSON or GraphQL-style web responses captured from Threads over DOM scraping. +- Normalize search results into stable agent-readable rows. +- Fail clearly when login, checkpoint, throttling, endpoint, timeout, or response-shape assumptions do not hold. + +**Non-Goals:** +- Do not use `https://graph.threads.net/keyword_search`. +- Do not automate login or bypass checkpoints, CAPTCHA, permissions, or private-content boundaries. +- Do not implement high-volume crawling, deep pagination, or archival collection in the first version. +- Do not merge profile search and post search unless the observed endpoint naturally returns both and the fields are clearly separated. + +## Decisions + +1. Use `Strategy.INTERCEPT` for the first implementation unless recon proves a simpler cookie-backed fetch is stable. + + Rationale: Threads web search is likely backed by dynamic headers, tokens, GraphQL payloads, or Relay responses. Intercepting the live web app gives the adapter better evidence before replaying a request. Alternative considered: `Strategy.COOKIE` with a direct `page.evaluate(fetch(...))`; this remains acceptable if recon verifies ordinary cookies and stable headers are sufficient. + +2. Implement a narrow post-search command before pagination or profile search. 
+ + Rationale: The first adapter needs to be repairable when Threads changes. A small command surface with `query` and `limit` reduces ambiguity and avoids adding cursor semantics before the endpoint contract is known. Alternative considered: add `tab`, `cursor`, and `raw` immediately; those are deferred until field discovery confirms they are useful. + +3. Normalize only fields that can be decoded from observed data. + + Rationale: Agents need stable columns, but fabricated or guessed metrics are worse than missing data. The minimum acceptable output is `rank`, `username`, `text`, `timestamp`, and `url`; engagement counts are nullable when absent or unreliable. Alternative considered: scrape visible counts from DOM as a fallback; this is too brittle for the initial JSON-first path. + +4. Treat live recon as a required implementation step. + + Rationale: Internal Threads endpoints and response shapes are not stable public contracts. Before final adapter code is committed, the endpoint, request parameters, required headers, response format, and field mapping must be validated against visible UI results for at least two queries. + +## Risks / Trade-offs + +- Threads login is missing or checkpointed -> report an auth/checkpoint error and stop; the adapter must not automate login. +- Threads changes internal endpoints or field names -> keep normalization isolated and return response-shape errors with enough context to repair. +- JSON response contains mixed result types -> include post rows only unless profile rows are explicitly modeled later. +- Counts or timestamps are absent or encoded unexpectedly -> return `null` rather than inventing values. +- Live recon cannot be performed in CI -> cover pure input validation and normalization helpers with fixture-driven tests; verify the full adapter manually with `opencli browser verify`. 
diff --git a/openspec/changes/threads-login-search-adapter/proposal.md b/openspec/changes/threads-login-search-adapter/proposal.md new file mode 100644 index 000000000..ea61e9d1f --- /dev/null +++ b/openspec/changes/threads-login-search-adapter/proposal.md @@ -0,0 +1,26 @@ +## Why + +Agents need a way to search Threads content that the user can already view in their logged-in Chrome session, without depending on the official Threads API Keyword Search permission. A browser-backed OpenCLI adapter gives agents structured search results while keeping the scope narrow and tied to normal user-visible access. + +## What Changes + +- Add a `threads search` OpenCLI command that searches Threads posts by keyword through the logged-in browser session. +- Return normalized result rows with stable fields for rank, author identity, post text, timestamp, permalink, and engagement counts when reliable. +- Prefer captured or replayed Threads web JSON responses over DOM scraping. +- Validate query and limit inputs, with conservative bounds. +- Handle unauthenticated, checkpoint, rate-limit, empty-result, timeout, and response-shape failures with typed errors. +- Do not use the official `graph.threads.net/keyword_search` API or require Meta developer credentials. + +## Capabilities + +### New Capabilities +- `threads-login-search`: Browser-backed Threads post search using the user's existing web login state. + +### Modified Capabilities + +## Impact + +- Adds a new Threads adapter command under the OpenCLI adapter surface. +- May add or update adapter manifest/build coverage for the new command. +- Adds focused tests for input validation and normalization logic where feasible without live Threads access. +- Requires live recon against `www.threads.com` before finalizing the endpoint strategy. 
diff --git a/openspec/changes/threads-login-search-adapter/specs/threads-login-search/spec.md b/openspec/changes/threads-login-search-adapter/specs/threads-login-search/spec.md new file mode 100644 index 000000000..08f8a6c7c --- /dev/null +++ b/openspec/changes/threads-login-search-adapter/specs/threads-login-search/spec.md @@ -0,0 +1,70 @@ +## ADDED Requirements + +### Requirement: Search Threads posts through browser login state +The system SHALL provide a `threads search` command that searches Threads posts using the user's existing logged-in browser session. + +#### Scenario: Search uses logged-in browser access +- **WHEN** the user runs `opencli threads search "openai" --limit 20 -f json` while logged in to Threads in Chrome +- **THEN** the command returns Threads post search results visible to that logged-in user + +#### Scenario: Unauthenticated session stops safely +- **WHEN** the user runs `opencli threads search "openai"` without a usable Threads login session +- **THEN** the command fails with an authentication-oriented error instead of attempting login automation + +### Requirement: Avoid official keyword search API +The system MUST NOT use the official `graph.threads.net/keyword_search` endpoint or require Meta developer credentials for the browser-backed search command. + +#### Scenario: Adapter performs browser-backed search +- **WHEN** the command performs a search request +- **THEN** it uses the Threads web session or web UI evidence rather than the official keyword search API + +### Requirement: Validate search inputs +The system SHALL require a non-empty keyword query and SHALL clamp or reject excessive limits using conservative bounds. 
+ +#### Scenario: Missing query is rejected +- **WHEN** the user runs `opencli threads search ""` +- **THEN** the command fails with an input validation error before navigating or sending a search request + +#### Scenario: Limit is bounded +- **WHEN** the user requests more than the supported maximum number of results +- **THEN** the command uses the supported maximum or reports a validation error without high-volume collection + +### Requirement: Return normalized result rows +The system SHALL return structured rows for post results with stable fields for rank, username, display name, text, timestamp, permalink URL, and engagement metadata when available. + +#### Scenario: Minimum row fields are present +- **WHEN** Threads returns at least one post search result +- **THEN** each normalized row includes `rank`, `username`, `text`, `timestamp`, and `url` + +#### Scenario: Optional metrics are not invented +- **WHEN** reply, repost, or like counts are absent or unreliable in the observed response +- **THEN** the command returns `null` or omits those optional metrics rather than fabricating values + +### Requirement: Prefer structured web responses +The system SHALL prefer JSON, GraphQL, Relay, or other structured web responses captured from the Threads web app over DOM scraping. + +#### Scenario: Structured response is available +- **WHEN** the Threads search UI emits a structured response containing post results +- **THEN** the adapter decodes results from that response rather than scraping rendered DOM text + +#### Scenario: Response shape changes +- **WHEN** the observed structured response no longer contains the required fields +- **THEN** the command fails with a response-shape error that can be used for adapter repair + +### Requirement: Handle web access failures safely +The system SHALL detect login walls, checkpoints, CAPTCHA, rate limiting, empty results, timeouts, and unexpected response shapes without bypassing site controls. 
+ +#### Scenario: Human verification appears +- **WHEN** Threads presents a checkpoint or CAPTCHA during search +- **THEN** the command reports that human verification is required and stops + +#### Scenario: No posts match query +- **WHEN** Threads returns a successful search response with no post results +- **THEN** the command returns an empty result set or an explicit empty-result status without treating it as a parser failure + +### Requirement: Scope first version conservatively +The system SHALL limit the first implementation to keyword post search and SHALL NOT implement large-scale pagination or archival collection. + +#### Scenario: Cursor support is unavailable +- **WHEN** the user requests behavior beyond the first supported result page +- **THEN** the command keeps collection within the bounded first-version scope diff --git a/openspec/changes/threads-login-search-adapter/tasks.md b/openspec/changes/threads-login-search-adapter/tasks.md new file mode 100644 index 000000000..f95b4634f --- /dev/null +++ b/openspec/changes/threads-login-search-adapter/tasks.md @@ -0,0 +1,21 @@ +## 1. Recon + +- [x] 1.1 Run `opencli doctor` and confirm the browser bridge is healthy. +- [x] 1.2 Check existing Threads site memory and adapter references for reusable endpoint or field information. +- [x] 1.3 Confirm the user is logged in at `https://www.threads.com/` and no checkpoint, CAPTCHA, or login wall blocks normal UI access. +- [x] 1.4 Run Threads search UI recon for at least two queries and capture the candidate structured response endpoint, method, parameters, headers, pagination signals, and result shape. +- [x] 1.5 Verify the candidate endpoint or intercepted response against visible UI values for one known result. + +## 2. Adapter Implementation + +- [x] 2.1 Scaffold or add the `threads/search` adapter with `query` and bounded `limit` inputs. 
+- [x] 2.2 Implement browser-backed search using the verified Threads web response path without calling `graph.threads.net/keyword_search`. +- [x] 2.3 Normalize post rows to stable output columns, returning nullable optional metrics when unavailable. +- [x] 2.4 Add typed handling for validation, unauthenticated session, checkpoint/CAPTCHA, rate limit, timeout, empty results, and response-shape failures. + +## 3. Verification + +- [x] 3.1 Add focused fixture/unit coverage for validation and normalization helpers where live Threads access is not required. +- [x] 3.2 Run `opencli browser verify threads/search` with live browser access and compare at least one row against the visible Threads UI. +- [x] 3.3 Record verified endpoint and field notes in Threads site memory after live verification succeeds. +- [x] 3.4 Run focused tests and build/manifest checks needed for the touched adapter files. diff --git a/openspec/config.yaml b/openspec/config.yaml new file mode 100644 index 000000000..392946c67 --- /dev/null +++ b/openspec/config.yaml @@ -0,0 +1,20 @@ +schema: spec-driven + +# Project context (optional) +# This is shown to AI when creating artifacts. +# Add your tech stack, conventions, style guides, domain knowledge, etc. +# Example: +# context: | +# Tech stack: TypeScript, React, Node.js +# We use conventional commits +# Domain: e-commerce platform + +# Per-artifact rules (optional) +# Add custom rules for specific artifacts. +# Example: +# rules: +# proposal: +# - Keep proposals under 500 words +# - Always include a "Non-goals" section +# tasks: +# - Break tasks into chunks of max 2 hours