Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions src/core/citability.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,125 @@ Furthermore, we plan to add more features.`;
const stats = result.dimensions.find(d => d.name === 'Statistical Density')!;
expect(stats.score).toBeGreaterThanOrEqual(12);
});

it('suggests attribution when statistical claims lack evidence signals', () => {
const statsContent = `Our platform serves 50,000 users across 120 countries. Revenue grew 250% in 2024. We process $2.5 million in transactions daily with 99.99% uptime.`;
const result = scorePageCitability(makePage(statsContent));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});

it('does not suggest attribution when statistical claims include evidence signals', () => {
const statsContent = `According to the 2024 Benchmark Report, our platform serves 50,000 users across 120 countries. Revenue grew 250% in 2024. Source: https://example.com/report`;
const result = scorePageCitability(makePage(statsContent));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeUndefined();
});

it('still suggests attribution for self-referential phrasing that mentions reports/studies', () => {
// "our internal report" / "based on our data" / "user behavior study" are
// self-referential — they should NOT silence the attribution hint.
const selfReferential = `Based on our internal data, we grew 300% in 2024. See our quarterly report for more numbers — our user behavior study found 80% retention.`;
const result = scorePageCitability(makePage(selfReferential));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});

it('recognizes "study by" / "report from" attribution phrases as evidence', () => {
const externalAttribution = `A 2024 study by Stanford found 80% adoption. Revenue figures came from a report from McKinsey, showing 250% growth.`;
const result = scorePageCitability(makePage(externalAttribution));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeUndefined();
});

it('still suggests attribution for "Citing our own data" — bare citing is self-referential', () => {
// The bare "citing" alternative used to swallow the hint here. Real attribution uses
// "cited by/in" or "as cited" — those are kept. "Citing our X" is self-referential.
const selfCiting = `Citing our own data, we grew 300% in 2024. Revenue grew 250% to $50 million daily.`;
const result = scorePageCitability(makePage(selfCiting));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});

it('still suggests attribution for "According to our internal data" — self-referential according', () => {
// "according to" used to be unscoped and silently passed for "according to our X". The
// negative lookahead excludes our/my/us and "the {company,team,organization,internal}".
const selfAccording = `According to our internal data, we grew 300% in 2024. Revenue grew 250% to $50 million daily.`;
const result = scorePageCitability(makePage(selfAccording));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});

it('recognizes "according to" an external source as evidence', () => {
const externalAccording = `According to Gartner, the market grew 40% in 2024. Revenue figures reached $50 million daily.`;
const result = scorePageCitability(makePage(externalAccording));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeUndefined();
});

it('still suggests attribution for "data from our research team" — self-referential data-from', () => {
// "data from" used to be unguarded — "Data from our research team" would silently
// suppress the hint. Same negative-lookahead scoping as "according to".
const selfData = `Data from our research team shows we grew 300% in 2024. Revenue grew 250% to $50 million daily.`;
const result = scorePageCitability(makePage(selfData));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});

it('recognizes "data from" an external source as evidence', () => {
const externalData = `Data from Bloomberg shows the market grew 40% in 2024. Revenue figures reached $50 million daily.`;
const result = scorePageCitability(makePage(externalData));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeUndefined();
});

it('still suggests attribution for "study by our team" and "reported by our CEO"', () => {
// Same self-referential guard for study/report/research-by-from and reported/published-by-in.
const selfStudy = `In a study by our team, we grew 300% in 2024. Reported by our CEO, revenue reached $50 million daily.`;
const result = scorePageCitability(makePage(selfStudy));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});

it('recognizes "According to US regulators" as external evidence (not self-referential)', () => {
// /i flag was making 'us' in NOT_SELF_REF match 'US' (United States), incorrectly
// flagging legitimate citations like "According to US regulators". 'us' was dropped.
const usCitation = `According to US regulators, the market grew 40% in 2024. Revenue figures reached $50 million daily across 120 countries.`;
const result = scorePageCitability(makePage(usCitation));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeUndefined();
});

it('still suggests attribution for "As cited in our internal report"', () => {
// 'as cited' alone is meaningless without 'in/by SOURCE' — covered by the
// 'cited (by|in)' pattern with the self-referential guard.
const selfAsCited = `As cited in our internal report, we grew 300% in 2024 across 120 countries. Daily transactions reached $50 million.`;
const result = scorePageCitability(makePage(selfAsCited));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});

it('still suggests attribution for "cited by our internal research team"', () => {
const selfCited = `Revenue data cited by our internal research team shows we grew 300% in 2024 across 120 countries. Daily transactions reached $50 million.`;
const result = scorePageCitability(makePage(selfCited));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});

it('recognizes "cited by" an external source as evidence', () => {
const externalCited = `Findings cited by Stanford show the market grew 40% in 2024. Revenue figures reached $50 million daily.`;
const result = scorePageCitability(makePage(externalCited));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeUndefined();
});

it('still suggests attribution for "Sources: our internal team"', () => {
// The sources?: label is a strong signal but only when followed by an external source.
const selfSources = `We grew 300% in 2024. Revenue reached $50 million daily across 120 countries. Sources: our internal team and our research department.`;
const result = scorePageCitability(makePage(selfSources));
const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution'));
expect(evidenceHint).toBeDefined();
});
});

describe('scoreSiteCitability', () => {
Expand Down
31 changes: 31 additions & 0 deletions src/core/citability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,42 @@ function scoreStatisticalDensity(content: string, hints: ContentHint[]): Citabil

if (totalMatches === 0) {
hints.push({ type: 'suggestion', message: 'No statistics or factual claims found — AI favors content with concrete numbers, percentages, and dates' });
} else if (!hasEvidenceSignals(content)) {
hints.push({ type: 'suggestion', message: 'Add source links or attribution for statistical claims so AI systems can verify them' });
}

return { name: 'Statistical Density', score, maxScore: 25, details: `${totalMatches} statistical claims (${density.toFixed(1)} per 100 words)` };
}

// Shared negative-lookahead fragment used by every attribution pattern. The lookahead
// is anchored immediately after the keyword and consumes any whitespace internally so
// the engine can't backtrack \s* to 0 chars and slip a self-referential token through.
// Excludes "our/my" and "the {company,team,organization,internal}" so phrases like
// "according to our CEO" or "study from our team" don't masquerade as evidence.
// 'us' is intentionally omitted — the /i flag makes it also match "US" (United States),
// which would mis-flag legitimate external sources like "According to US regulators".
// The bare "data from us" / "according to us" cases are uncommon and not worth the
// collision with US-government citations.
const NOT_SELF_REF = String.raw`(?!\s*(?:our|my|the\s+(?:company|team|organization|internal))\b)`;

function hasEvidenceSignals(content: string): boolean {
const evidencePatterns = [
// External-source markers: URLs, footnote refs, or explicit attribution phrases.
// Every keyword pair must be followed by something that isn't self-referential —
// "our internal report", "data from us", "study by our team" should NOT count.
/https?:\/\/\S+/i,
/\[\^?\d+\]/,
new RegExp(String.raw`\baccording to\b${NOT_SELF_REF}`, 'i'),
new RegExp(String.raw`\bsources?:${NOT_SELF_REF}`, 'i'),
new RegExp(String.raw`\b(reported|published) (by|in)\b${NOT_SELF_REF}`, 'i'),
new RegExp(String.raw`\b(study|survey|report|research|paper|analysis) (by|from)\b${NOT_SELF_REF}`, 'i'),
new RegExp(String.raw`\bdata (from|by)\b${NOT_SELF_REF}`, 'i'),
new RegExp(String.raw`\bcited (by|in)\b${NOT_SELF_REF}`, 'i'),
];

return evidencePatterns.some(pattern => pattern.test(content));
}

/**
* Dimension 4: Structure Quality (0-25)
* Checks heading hierarchy, lists, short paragraphs, summary/TL;DR.
Expand Down