Skip to content

Commit 34f5d42

Browse files
urvishp80 authored and adamjonas committed
update scrapers:
- custom '_id' for each doc
- added 'indexed_at' field
- insert only new data to ES rather than inserting all from scratch
- remove redundant code
1 parent c746ec1 commit 34f5d42

File tree

9 files changed

+248
-121
lines changed

9 files changed

+248
-121
lines changed

bitcoinbook/main.py

Lines changed: 18 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,38 +3,33 @@
33
import re
44
import requests
55
from datetime import datetime
6-
7-
def get_github_urls(base_url: str) -> list:
8-
"""
9-
get a list of urls
10-
"""
11-
12-
urls = []
13-
for chapter in chapters:
14-
urls.append(base_url + chapter)
15-
16-
return urls
6+
from loguru import logger
177

188

9+
if __name__ == "__main__":
1910

20-
def parse_chapters(urls):
11+
site = 'https://github.com/bitcoinbook/bitcoinbook/blob/develop'
12+
chapters = ['/ch01.asciidoc', '/ch02.asciidoc', '/ch03.asciidoc', '/ch04.asciidoc',
13+
'/ch05.asciidoc', '/ch06.asciidoc', '/ch07.asciidoc', '/ch08.asciidoc',
14+
'/ch09.asciidoc', '/ch10.asciidoc', '/ch11.asciidoc', '/ch12.asciidoc']
15+
chapter_links = [f"{site}{chapter}" for chapter in chapters]
2116

22-
for url in urls:
17+
documents = []
18+
for url in chapter_links:
2319
data = requests.get(url).text
24-
soup = BeautifulSoup(data,'html.parser')
25-
document = {}
20+
soup = BeautifulSoup(data, 'html.parser')
2621
title = soup.find('h2', dir='auto').get_text()
27-
body = soup.find('div',id = 'readme').get_text()
22+
body = soup.find('div', id='readme').get_text()
2823
body_type = "md"
2924
author = "Andreas Antonopoulous"
3025
chapter_number = ''.join(re.findall(r'\d+', url))
3126
id = "bitcoinbook-chapter-" + chapter_number
3227
tags = ""
3328
domain = "https://github.com/bitcoinbook/bitcoinbook"
3429
url = url
35-
created_at = "2022-11-15" # date of most recent commit
30+
created_at = "2022-11-15" # date of most recent commit
3631

37-
document.update({
32+
document = {
3833
"title": title,
3934
"body": body,
4035
"body_type": body_type,
@@ -45,27 +40,14 @@ def parse_chapters(urls):
4540
"url": url,
4641
"created_at": created_at,
4742
"indexed_at": datetime.utcnow().isoformat()
48-
})
49-
print(document.get("id"))
50-
documents.append(document)
43+
}
5144

52-
if __name__ == "__main__":
53-
54-
documents = []
55-
56-
chapters = ['/ch01.asciidoc','/ch02.asciidoc','/ch03.asciidoc','/ch04.asciidoc',
57-
'/ch05.asciidoc','/ch06.asciidoc','/ch07.asciidoc','/ch08.asciidoc',
58-
'/ch09.asciidoc','/ch10.asciidoc','/ch11.asciidoc','/ch12.asciidoc']
59-
60-
site = 'https://github.com/bitcoinbook/bitcoinbook/blob/develop'
61-
print("Getting links for bitcoin book")
62-
chapter_links = get_github_urls(site)
63-
parse_chapters(chapter_links)
64-
print ("Number of documents: " + str(len(documents)))
45+
logger.info(document.get("id"))
46+
documents.append(document)
47+
print("Number of documents: " + str(len(documents)))
6548

6649
with open("bitcoinbook.json", "w") as f:
67-
json.dump(documents, f, indent=4)
50+
json.dump(documents, f, indent=4)
6851

6952
# Close the file
7053
f.close()
71-

bitcoinops/main.js

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,14 @@ const AdmZip = require('adm-zip');
44
const path = require('path');
55
const request = require('request');
66
const yaml = require('js-yaml');
7-
const { basename } = require('path');
7+
const {basename} = require('path');
88
const marked = require('marked');
9-
const { create_batches, index_documents } = require('../common/elasticsearch-scraper/util');
9+
const {
10+
checkDocumentExist,
11+
create_document,
12+
delete_document_if_exist,
13+
document_view
14+
} = require('../common/elasticsearch-scraper/util');
1015

1116
dotenv.config();
1217

@@ -16,9 +21,9 @@ async function download_repo() {
1621
const URL = "https://github.com/bitcoinops/bitcoinops.github.io/archive/refs/heads/master.zip";
1722
const dir = path.join(process.env.DATA_DIR, "bitcoinops");
1823
if (!fs.existsSync(dir)) {
19-
fs.mkdirSync(dir);
24+
fs.mkdirSync(dir, {recursive: true});
2025
}
21-
26+
2227
if (fs.existsSync(path.join(dir, folder_name))) {
2328
console.log("Repo already downloaded");
2429
return;
@@ -102,6 +107,7 @@ function parse_post(path, topic = false) {
102107

103108
const stringRepresentation = parsedBody.map(obj => JSON.stringify(obj)).join(', ');
104109
const frontMatterObj = yaml.load(frontMatter);
110+
const indexed_at = new Date().toISOString();
105111
const document = {
106112
id: "bitcoinops-" + (topic ? basename(path, '.md') : frontMatterObj.slug),
107113
title: frontMatterObj.title,
@@ -114,6 +120,7 @@ function parse_post(path, topic = false) {
114120
type: topic ? 'topic' : frontMatterObj.type,
115121
language: frontMatterObj.lang,
116122
authors: ["bitcoinops"],
123+
indexed_at: indexed_at
117124
};
118125

119126
return document;
@@ -138,7 +145,22 @@ async function main() {
138145

139146
console.log(`Parsed ${documents.length} documents`);
140147

141-
await index_documents(documents);
148+
let count = 0;
149+
for (let i = 0; i < documents.length; i++) {
150+
const document = documents[i];
151+
152+
// // delete posts with previous logic where '_id' was set on its own and replace them with our logic
153+
// const deleteId = await delete_document_if_exist(document.id)
154+
155+
const viewResponse = await document_view(document.id);
156+
if (!viewResponse) {
157+
const createResponse = await create_document(document);
158+
count++;
159+
}
160+
161+
}
162+
console.log(`Inserted ${count} new documents`);
163+
142164
}
143165

144166
main();

bitcointalk/index.js

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,38 @@
1-
const BOARD = 'https://bitcointalk.org/index.php?board=6.';
21

32
const cheerio = require('cheerio');
43
const fs = require('fs');
54
const path = require('path');
65
const dotenv = require('dotenv');
76
dotenv.config();
87

9-
const { index_documents, fetch_with_retry } = require('../common/util');
8+
const {index_documents, fetch_with_retry,} = require('../common/util');
9+
const {
10+
checkDocumentExist,
11+
create_document,
12+
delete_document_if_exist,
13+
document_view
14+
} = require('../common/elasticsearch-scraper/util')
1015

11-
const URL = process.env.URL || BOARD;
16+
const BOARD_URL = 'https://bitcointalk.org/index.php?board=6.';
17+
18+
authors = ['achow101', 'kanzure', 'Sergio_Demian_Lerner', 'Nicolas Dorier', 'jl2012', 'Peter Todd', 'Gavin Andresen', 'adam3us', 'Pieter Wuille', 'Meni Rosenfeld', 'Mike Hearn', 'wumpus', 'Luke-Jr', 'Matt Corallo', 'jgarzik', 'andytoshi', 'satoshi', 'Cdecker', 'TimRuffing', 'gmaxwell'];
1219

1320
async function fetch_all_topics() {
1421
if (!fs.existsSync(path.join(process.env.DATA_DIR, 'bitcointalk'))) {
15-
fs.mkdirSync(path.join(process.env.DATA_DIR, 'bitcointalk'));
22+
fs.mkdirSync(path.join(process.env.DATA_DIR, 'bitcointalk'), {recursive: true});
1623
}
1724
let offset = 0;
1825
const topics = [];
1926
while (true) {
20-
console.log(`Downloading page ${offset/40}...`);
21-
const url = URL + offset;
27+
console.log(`Downloading page ${offset / 40}...`);
28+
const url = BOARD_URL + offset;
2229
let success = false;
2330
let tops = [];
24-
while(!success) {
31+
while (!success) {
2532
const response = await fetch(url);
2633
const text = await response.text();
27-
if(response.status !== 200) {
28-
console.log(`Error ${response.status} downloading page ${offset/20}`);
34+
if (response.status !== 200) {
35+
console.log(`Error ${response.status} downloading page ${offset / 20}`);
2936
await new Promise(resolve => setTimeout(resolve, 2000));
3037
continue;
3138
}
@@ -87,9 +94,15 @@ async function get_documents_from_post(url) {
8794

8895
for (const tr of trList) {
8996
const author = $(tr).find('.poster_info > b > a').text();
97+
98+
if (!authors.includes(author)) {
99+
continue;
100+
}
101+
console.log(`post by : ${author}`)
102+
90103
// text without title attribute
91104
let date = $(tr).find('.td_headerandpost .smalltext > .edited').text();
92-
if(date === '') {
105+
if (date === '') {
93106
date = $(tr).find('.td_headerandpost .smalltext').text();
94107
}
95108

@@ -113,24 +126,26 @@ async function get_documents_from_post(url) {
113126
body = body.text();
114127

115128
const dateJs = new Date(date);
129+
const indexed_at = new Date().toISOString();
116130

117131
const id = url.substring(url.indexOf('#msg') + 4);
118132

119133
const document = {
120134
authors: [author],
121135
body,
122136
body_type: 'raw',
123-
domain: 'https://bitcointalk.org',
137+
domain: 'https://bitcointalk.org/',
124138
url,
125139
title,
126140
id: 'bitcointalk-' + id,
127141
created_at: dateJs,
142+
indexed_at: indexed_at,
128143
type: messageNumber === "#1" ? "topic" : "post",
129144
}
130145

131146
documents.push(document);
132147
}
133-
148+
console.log(`Filtered ${documents.length} posts in ${url}`);
134149
return {documents, urls};
135150
}
136151

@@ -164,16 +179,29 @@ async function main() {
164179
}
165180

166181
console.log(`Found ${topics.length} topics`);
167-
182+
let count = 0;
168183
const start_index = process.env.START_INDEX ? parseInt(process.env.START_INDEX) : 0;
169-
170184
for (let i = start_index; i < topics.length; i++) {
171185
const topic = topics[i];
172-
console.log(`Processing ${i+1}/${topics.length}`);
186+
console.log(`Processing ${i + 1}/${topics.length}`);
173187
const documents = await fetch_posts(topic);
174188

175-
await index_documents(documents);
189+
for (let i = 0; i < documents.length; i++) {
190+
const document = documents[i];
191+
192+
// // delete posts with previous logic where '_id' was set on its own and replace them with our logic
193+
// const deleteId = await delete_document_if_exist(document.id)
194+
195+
const viewResponse = await document_view(document.id);
196+
if (!viewResponse) {
197+
const createResponse = await create_document(document);
198+
count++;
199+
}
200+
201+
}
202+
176203
}
204+
console.log(`Inserted ${count} new documents`);
177205
}
178206

179207
main().catch(console.error);

bitcointranscripts/main.js

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@ const path = require('path');
55
const request = require('request');
66
const yaml = require('js-yaml');
77
const marked = require('marked');
8-
const { checkDocumentExist, create_document} = require('../common/elasticsearch-scraper/util.js');
8+
const {delete_document_if_exist, create_document, document_view} = require('../common/elasticsearch-scraper/util.js');
99
const md5 = require('md5');
10+
const { log } = require('console');
1011

1112
dotenv.config();
1213

@@ -51,21 +52,25 @@ function parse_posts(dir) {
5152
let documents = [];
5253
const files = fs.readdirSync(dir);
5354
for (const file of files) {
54-
if (fs.statSync(path.join(dir, file)).isDirectory()) {
55-
if (file === '.github') continue;
56-
documents = documents.concat(parse_posts(path.join(dir, file)));
57-
continue
55+
try{
56+
if (fs.statSync(path.join(dir, file)).isDirectory()) {
57+
if (file === '.github') continue;
58+
documents = documents.concat(parse_posts(path.join(dir, file)));
59+
continue
60+
}
61+
62+
if (dir === path.join(process.env.DATA_DIR, "bitcointranscripts", folder_name)) continue;
63+
if (file.startsWith('_')) continue;
64+
// Skip if file ends with .??.md (skip translations)
65+
if (file.match(/\.([a-z][a-z])\.md$/)) continue;
66+
if (!file.endsWith('.md')) continue;
67+
68+
console.log(`Parsing ${path.join(dir, file)}...`);
69+
const document = parse_post(path.join(dir, file));
70+
documents.push(document);
71+
}catch{
72+
continue;
5873
}
59-
60-
if (dir === path.join(process.env.DATA_DIR, "bitcointranscripts", folder_name)) continue;
61-
if (file.startsWith('_')) continue;
62-
// Skip if file ends with .??.md (skip translations)
63-
if (file.match(/\.([a-z][a-z])\.md$/)) continue;
64-
if (!file.endsWith('.md')) continue;
65-
66-
console.log(`Parsing ${path.join(dir, file)}...`);
67-
const document = parse_post(path.join(dir, file));
68-
documents.push(document);
6974
}
7075
return documents;
7176
}
@@ -112,6 +117,7 @@ function parse_post(p_path) {
112117
const frontMatterObj = yaml.load(frontMatter);
113118
const id = md5(pathWithoutExtension.replace(path.join(process.env.DATA_DIR, "bitcointranscripts", folder_name), '').replaceAll("/", "-")).substring(0, 20);
114119
const stringParsedBodyRepresentation = parsedBody.map(obj => JSON.stringify(obj)).join(', ');
120+
const indexed_at = new Date().toISOString();
115121
const document = {
116122
id: "bitcointranscripts-" + id,
117123
title: frontMatterObj.title,
@@ -125,6 +131,7 @@ function parse_post(p_path) {
125131
tags: frontMatterObj.tags,
126132
media: frontMatterObj.media,
127133
authors: frontMatterObj.speakers,
134+
indexed_at: indexed_at,
128135
transcript_by: frontMatterObj.transcript_by,
129136
};
130137

@@ -135,18 +142,25 @@ async function main() {
135142
await download_repo();
136143
const dir = path.join(process.env.DATA_DIR, "bitcointranscripts", folder_name);
137144
const documents = parse_posts(dir);
138-
let count = 0;
139145

140146
console.log(`Filtering existing ${documents.length} documents... please wait...`);
147+
148+
let count = 0;
141149
for (let i = 0; i < documents.length; i++) {
142150
const document = documents[i];
143-
const isExist = await checkDocumentExist(document.id);
144-
if (!isExist) {
151+
152+
// // delete posts with previous logic where '_id' was set on its own and replace them with our logic
153+
// const deleteId = await delete_document_if_exist(document.id)
154+
155+
const viewResponse = await document_view(document.id);
156+
if (!viewResponse) {
157+
const createResponse = await create_document(document);
145158
count++;
146-
await create_document(document);
147159
}
160+
148161
}
149162
console.log(`Inserted ${count} new documents`);
163+
150164
}
151165

152166
main();

0 commit comments

Comments
 (0)