Skip to content

Commit 08a65ac

Browse files
committed
Add candidatesFilters option
1 parent 3f5ee1e commit 08a65ac

File tree

5 files changed

+568
-4
lines changed

5 files changed

+568
-4
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,23 @@ read(url, {
8181
});
8282
```
8383

84+
- `candidatesFilters` which allows you to set your own filters for candidate tags.
85+
86+
options.candidatesFilters = [callback(obj, index)]
87+
```javascript
88+
read(url, {
89+
candidatesFilters: [
90+
function (obj) {
91+
if (obj.tagName === 'ARTICLE' && obj.getAttribute('type') === 'video') {
92+
return false;
93+
}
94+
return true;
95+
}
96+
]}, function(err, article, response) {
97+
//...
98+
});
99+
```
100+
84101
- `preprocess` which should be a function to check or modify downloaded source before passing it to readability.
85102

86103
options.preprocess = callback(source, response, contentType, callback);

src/helpers.js

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,16 @@ exports.debug = function(debug) {
2222
};
2323

2424
var cleanRules = [];
25+
var candidatesFilters = [];
2526

2627
module.exports.setCleanRules = function(rules) {
2728
cleanRules = rules;
2829
};
2930

31+
module.exports.setCandidatesFilters = function(filters) {
32+
candidatesFilters = filters;
33+
};
34+
3035
/**
3136
* Prepare the HTML document for readability to scrape it.
3237
* This includes things like stripping javascript, CSS, and handling terrible markup.
@@ -65,7 +70,7 @@ var prepDocument = module.exports.prepDocument = function(document) {
6570
}
6671
}
6772
}
68-
73+
6974
// Strip out all <script> tags, as they *should* be useless
7075
var scripts = document.getElementsByTagName('script');
7176
[].forEach.call(scripts, function (node) {
@@ -182,6 +187,12 @@ var grabArticle = module.exports.grabArticle = function(document, preserveUnlike
182187
grandParentNode.readability.contentScore += contentScore / 2;
183188
}
184189

190+
if (candidatesFilters.length) {
191+
candidatesFilters.forEach(function(filterBy) {
192+
candidates = candidates.filter(filterBy);
193+
});
194+
}
195+
185196

186197
/**
187198
* After we've calculated scores, loop through all of the possible candidate nodes we found

src/readability.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ function Readability(window, options) {
1818
this.bodyCache = null;
1919
this._articleContent = '';
2020
helpers.setCleanRules(options.cleanRulers || []);
21+
helpers.setCandidatesFilters(options.candidatesFilters || []);
2122

2223
this.cache = {};
2324

test/article-tests.js

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ describe('Regression Tests', function() {
1616
],
1717
notInclude: [
1818
'Donate to Wikipedia'
19-
]
19+
],
2020
},
2121
{
2222
fixture: 'mediashift',
@@ -80,11 +80,48 @@ describe('Regression Tests', function() {
8080
'最赞回应',
8181
'最新话题',
8282
'北京豆网科技有限公司',
83-
]
83+
],
84+
},
85+
{
86+
fixture: 'ifeng',
87+
title: '熊玲:什么样的婚姻才是鸡肋婚姻?',
88+
include: [
89+
'沃尔沃“憋”不住了,最高狂降8万,性能不输BBA,白菜价愣没人',
90+
'打开APP',
91+
],
92+
notInclude: [
93+
'它是“迷恋婚姻又排拒婚姻”的一种复杂婚姻情感心理状态。它意味着即便你有千条理由走出婚姻,背后却有万种吸引力把你留在围城里。',
94+
'在婚姻十字路口的人,你若要想你们的关系和好如初,就必须有重修婚姻的姿态,即必须有妥协的态度。',
95+
'重修婚姻的办法很多很多,但最简单也是最核心的办法只有一个,那就是接受。',
96+
],
97+
},
98+
{
99+
fixture: 'ifeng',
100+
title: '熊玲:什么样的婚姻才是鸡肋婚姻?',
101+
include: [
102+
'它是“迷恋婚姻又排拒婚姻”的一种复杂婚姻情感心理状态。它意味着即便你有千条理由走出婚姻,背后却有万种吸引力把你留在围城里。',
103+
'在婚姻十字路口的人,你若要想你们的关系和好如初,就必须有重修婚姻的姿态,即必须有妥协的态度。',
104+
'重修婚姻的办法很多很多,但最简单也是最核心的办法只有一个,那就是接受。',
105+
],
106+
notInclude: [
107+
'沃尔沃“憋”不住了,最高狂降8万,性能不输BBA,白菜价愣没人',
108+
'打开APP',
109+
],
110+
options: {
111+
candidatesFilters: [
112+
function (elem) {
113+
if (elem.tagName === 'ARTICLE' && elem.getAttribute('type') === 'video') {
114+
return false;
115+
}
116+
117+
return true;
118+
}
119+
],
120+
},
84121
}].forEach(function(testCase) {
85122
it('can extract ' + testCase.fixture + ' articles', function(done) {
86123
var html = fs.readFileSync(articleFixtures + '/' + testCase.fixture + '.html').toString();
87-
read(html, function(error, article) {
124+
read(html, testCase.options || {}, function(error, article) {
88125
if(error) {
89126
done(error)
90127
} else {

0 commit comments

Comments
 (0)