bitcoinsearch
diff --git a/bitcoinbook/main.py b/bitcoinbook/main.py
Lines changed: 18 additions & 36 deletions
diff --git a/bitcoinops/main.js b/bitcoinops/main.js
Lines changed: 27 additions & 5 deletions
diff --git a/bitcointalk/index.js b/bitcointalk/index.js
Lines changed: 44 additions & 16 deletions
diff --git a/bitcointranscripts/main.js b/bitcointranscripts/main.js
Lines changed: 33 additions & 19 deletions
@@ -3,38 +3,33 @@
 import re
 import requests
 from datetime import datetime
-
-def get_github_urls(base_url: str) -> list:
-    """
-    get a list of urls 
-    """
-
-    urls = []
-    for chapter in chapters:
-        urls.append(base_url + chapter)
-
-    return urls
+from loguru import logger
 
 
+if __name__ == "__main__":
 
-def parse_chapters(urls):
+    site = 'https://github.com/bitcoinbook/bitcoinbook/blob/develop'
+    chapters = ['/ch01.asciidoc', '/ch02.asciidoc', '/ch03.asciidoc', '/ch04.asciidoc',
+                '/ch05.asciidoc', '/ch06.asciidoc', '/ch07.asciidoc', '/ch08.asciidoc',
+                '/ch09.asciidoc', '/ch10.asciidoc', '/ch11.asciidoc', '/ch12.asciidoc']
+    chapter_links = [f"{site}{chapter}" for chapter in chapters]
 
-    for url in urls:
+    documents = []
+    for url in chapter_links:
         data = requests.get(url).text
-        soup = BeautifulSoup(data,'html.parser')
-        document = {}
+        soup = BeautifulSoup(data, 'html.parser')
         title = soup.find('h2', dir='auto').get_text()
-        body = soup.find('div',id = 'readme').get_text()
+        body = soup.find('div', id='readme').get_text()
         body_type = "md"
         author = "Andreas Antonopoulous"
         chapter_number = ''.join(re.findall(r'\d+', url))
         id = "bitcoinbook-chapter-" + chapter_number
         tags = ""
         domain = "https://github.com/bitcoinbook/bitcoinbook"
         url = url
-        created_at = "2022-11-15" # date of most recent commit
+        created_at = "2022-11-15"  # date of most recent commit
 
-        document.update({
+        document = {
             "title": title,
             "body": body,
             "body_type": body_type,
@@ -45,27 +40,14 @@ def parse_chapters(urls):
             "url": url,
             "created_at": created_at,
             "indexed_at": datetime.utcnow().isoformat()
-            })
-        print(document.get("id"))
-        documents.append(document)
+        }
 
-if __name__ == "__main__":
-
-    documents = []
-
-    chapters = ['/ch01.asciidoc','/ch02.asciidoc','/ch03.asciidoc','/ch04.asciidoc',
-            '/ch05.asciidoc','/ch06.asciidoc','/ch07.asciidoc','/ch08.asciidoc',
-            '/ch09.asciidoc','/ch10.asciidoc','/ch11.asciidoc','/ch12.asciidoc']
-
-    site = 'https://github.com/bitcoinbook/bitcoinbook/blob/develop'
-    print("Getting links for bitcoin book")
-    chapter_links = get_github_urls(site)
-    parse_chapters(chapter_links)
-    print ("Number of documents: " + str(len(documents)))
+        logger.info(document.get("id"))
+        documents.append(document)
+    print("Number of documents: " + str(len(documents)))
 
     with open("bitcoinbook.json", "w") as f:
-      json.dump(documents, f, indent=4)
+        json.dump(documents, f, indent=4)
 
     # Close the file
     f.close()
-
@@ -4,9 +4,14 @@ const AdmZip = require('adm-zip');
 const path = require('path');
 const request = require('request');
 const yaml = require('js-yaml');
-const { basename } = require('path');
+const {basename} = require('path');
 const marked = require('marked');
-const { create_batches, index_documents } = require('../common/elasticsearch-scraper/util');
+const {
+    checkDocumentExist,
+    create_document,
+    delete_document_if_exist,
+    document_view
+} = require('../common/elasticsearch-scraper/util');
 
 dotenv.config();
 
@@ -16,9 +21,9 @@ async function download_repo() {
     const URL = "https://github.com/bitcoinops/bitcoinops.github.io/archive/refs/heads/master.zip";
     const dir = path.join(process.env.DATA_DIR, "bitcoinops");
     if (!fs.existsSync(dir)) {
-        fs.mkdirSync(dir);
+        fs.mkdirSync(dir, {recursive: true});
     }
-    
+
     if (fs.existsSync(path.join(dir, folder_name))) {
         console.log("Repo already downloaded");
         return;
@@ -102,6 +107,7 @@ function parse_post(path, topic = false) {
 
     const stringRepresentation = parsedBody.map(obj => JSON.stringify(obj)).join(', ');
     const frontMatterObj = yaml.load(frontMatter);
+    const indexed_at = new Date().toISOString();
     const document = {
         id: "bitcoinops-" + (topic ? basename(path, '.md') : frontMatterObj.slug),
         title: frontMatterObj.title,
@@ -114,6 +120,7 @@ function parse_post(path, topic = false) {
         type: topic ? 'topic' : frontMatterObj.type,
         language: frontMatterObj.lang,
         authors: ["bitcoinops"],
+        indexed_at: indexed_at
     };
 
     return document;
@@ -138,7 +145,22 @@ async function main() {
 
     console.log(`Parsed ${documents.length} documents`);
 
-    await index_documents(documents);
+    let count = 0;
+    for (let i = 0; i < documents.length; i++) {
+        const document = documents[i];
+
+//        // delete posts with previous logic where '_id' was set on its own and replace them with our logic
+//        const deleteId = await delete_document_if_exist(document.id)
+
+        const viewResponse = await document_view(document.id);
+        if (!viewResponse) {
+            const createResponse = await create_document(document);
+            count++;
+        }
+
+    }
+    console.log(`Inserted ${count} new documents`);
+
 }
 
 main();
@@ -1,31 +1,38 @@
-const BOARD = 'https://bitcointalk.org/index.php?board=6.';
 
 const cheerio = require('cheerio');
 const fs = require('fs');
 const path = require('path');
 const dotenv = require('dotenv');
 dotenv.config();
 
-const { index_documents, fetch_with_retry } = require('../common/util');
+const {index_documents, fetch_with_retry,} = require('../common/util');
+const {
+    checkDocumentExist,
+    create_document,
+    delete_document_if_exist,
+    document_view
+} = require('../common/elasticsearch-scraper/util')
 
-const URL = process.env.URL || BOARD;
+const BOARD_URL = 'https://bitcointalk.org/index.php?board=6.';
+
+authors = ['achow101', 'kanzure', 'Sergio_Demian_Lerner', 'Nicolas Dorier', 'jl2012', 'Peter Todd', 'Gavin Andresen', 'adam3us', 'Pieter Wuille', 'Meni Rosenfeld', 'Mike Hearn', 'wumpus', 'Luke-Jr', 'Matt Corallo', 'jgarzik', 'andytoshi', 'satoshi', 'Cdecker', 'TimRuffing', 'gmaxwell'];
 
 async function fetch_all_topics() {
     if (!fs.existsSync(path.join(process.env.DATA_DIR, 'bitcointalk'))) {
-        fs.mkdirSync(path.join(process.env.DATA_DIR, 'bitcointalk'));
+        fs.mkdirSync(path.join(process.env.DATA_DIR, 'bitcointalk'), {recursive: true});
     }
     let offset = 0;
     const topics = [];
     while (true) {
-        console.log(`Downloading page ${offset/40}...`);
-        const url = URL + offset;
+        console.log(`Downloading page ${offset / 40}...`);
+        const url = BOARD_URL + offset;
         let success = false;
         let tops = [];
-        while(!success) {
+        while (!success) {
             const response = await fetch(url);
             const text = await response.text();
-            if(response.status !== 200) {
-                console.log(`Error ${response.status} downloading page ${offset/20}`);
+            if (response.status !== 200) {
+                console.log(`Error ${response.status} downloading page ${offset / 20}`);
                 await new Promise(resolve => setTimeout(resolve, 2000));
                 continue;
             }
@@ -87,9 +94,15 @@ async function get_documents_from_post(url) {
 
     for (const tr of trList) {
         const author = $(tr).find('.poster_info > b > a').text();
+
+        if (!authors.includes(author)) {
+            continue;
+        }
+        console.log(`post by : ${author}`)
+
         // text without title attribute
         let date = $(tr).find('.td_headerandpost .smalltext > .edited').text();
-        if(date === '') {
+        if (date === '') {
             date = $(tr).find('.td_headerandpost .smalltext').text();
         }
 
@@ -113,24 +126,26 @@ async function get_documents_from_post(url) {
         body = body.text();
 
         const dateJs = new Date(date);
+        const indexed_at = new Date().toISOString();
 
         const id = url.substring(url.indexOf('#msg') + 4);
 
         const document = {
             authors: [author],
             body,
             body_type: 'raw',
-            domain: 'https://bitcointalk.org',
+            domain: 'https://bitcointalk.org/',
             url,
             title,
             id: 'bitcointalk-' + id,
             created_at: dateJs,
+            indexed_at: indexed_at,
             type: messageNumber === "#1" ? "topic" : "post",
         }
 
         documents.push(document);
     }
-
+    console.log(`Filtered ${documents.length} posts in ${url}`);
     return {documents, urls};
 }
 
@@ -164,16 +179,29 @@ async function main() {
     }
 
     console.log(`Found ${topics.length} topics`);
-
+    let count = 0;
     const start_index = process.env.START_INDEX ? parseInt(process.env.START_INDEX) : 0;
-
     for (let i = start_index; i < topics.length; i++) {
         const topic = topics[i];
-        console.log(`Processing ${i+1}/${topics.length}`);
+        console.log(`Processing ${i + 1}/${topics.length}`);
         const documents = await fetch_posts(topic);
 
-        await index_documents(documents);
+        for (let i = 0; i < documents.length; i++) {
+            const document = documents[i];
+
+//            // delete posts with previous logic where '_id' was set on its own and replace them with our logic
+//            const deleteId = await delete_document_if_exist(document.id)
+
+            const viewResponse = await document_view(document.id);
+            if (!viewResponse) {
+                const createResponse = await create_document(document);
+                count++;
+            }
+
+        }
+
     }
+    console.log(`Inserted ${count} new documents`);
 }
 
 main().catch(console.error);
@@ -5,8 +5,9 @@ const path = require('path');
 const request = require('request');
 const yaml = require('js-yaml');
 const marked = require('marked');
-const { checkDocumentExist, create_document} = require('../common/elasticsearch-scraper/util.js');
+const {delete_document_if_exist, create_document, document_view} = require('../common/elasticsearch-scraper/util.js');
 const md5 = require('md5');
+const { log } = require('console');
 
 dotenv.config();
 
@@ -51,21 +52,25 @@ function parse_posts(dir) {
     let documents = [];
     const files = fs.readdirSync(dir);
     for (const file of files) {
-        if (fs.statSync(path.join(dir, file)).isDirectory()) {
-            if (file === '.github') continue;
-            documents = documents.concat(parse_posts(path.join(dir, file)));
-            continue
+        try{
+            if (fs.statSync(path.join(dir, file)).isDirectory()) {
+                if (file === '.github') continue;
+                documents = documents.concat(parse_posts(path.join(dir, file)));
+                continue
+            }
+    
+            if (dir === path.join(process.env.DATA_DIR, "bitcointranscripts", folder_name)) continue;
+            if (file.startsWith('_')) continue;
+            // Skip if file ends with .??.md (skip translations)
+            if (file.match(/\.([a-z][a-z])\.md$/)) continue;
+            if (!file.endsWith('.md')) continue;
+    
+            console.log(`Parsing ${path.join(dir, file)}...`);
+            const document = parse_post(path.join(dir, file));
+            documents.push(document);
+        }catch{
+            continue;
         }
-
-        if (dir === path.join(process.env.DATA_DIR, "bitcointranscripts", folder_name)) continue;
-        if (file.startsWith('_')) continue;
-        // Skip if file ends with .??.md (skip translations)
-        if (file.match(/\.([a-z][a-z])\.md$/)) continue;
-        if (!file.endsWith('.md')) continue;
-
-        console.log(`Parsing ${path.join(dir, file)}...`);
-        const document = parse_post(path.join(dir, file));
-        documents.push(document);
     }
     return documents;
 }
@@ -112,6 +117,7 @@ function parse_post(p_path) {
     const frontMatterObj = yaml.load(frontMatter);
     const id = md5(pathWithoutExtension.replace(path.join(process.env.DATA_DIR, "bitcointranscripts", folder_name), '').replaceAll("/", "-")).substring(0, 20);
     const stringParsedBodyRepresentation = parsedBody.map(obj => JSON.stringify(obj)).join(', ');
+    const indexed_at = new Date().toISOString();
     const document = {
         id: "bitcointranscripts-" + id,
         title: frontMatterObj.title,
@@ -125,6 +131,7 @@ function parse_post(p_path) {
         tags: frontMatterObj.tags,
         media: frontMatterObj.media,
         authors: frontMatterObj.speakers,
+        indexed_at: indexed_at,
         transcript_by: frontMatterObj.transcript_by,
     };
 
@@ -135,18 +142,25 @@ async function main() {
     await download_repo();
     const dir = path.join(process.env.DATA_DIR, "bitcointranscripts", folder_name);
     const documents = parse_posts(dir);
-    let count = 0;
 
     console.log(`Filtering existing ${documents.length} documents... please wait...`);
+
+    let count = 0;
     for (let i = 0; i < documents.length; i++) {
         const document = documents[i];
-        const isExist = await checkDocumentExist(document.id);
-        if (!isExist) {
+
+//        // delete posts with previous logic where '_id' was set on its own and replace them with our logic
+//        const deleteId = await delete_document_if_exist(document.id)
+
+        const viewResponse = await document_view(document.id);
+        if (!viewResponse) {
+            const createResponse = await create_document(document);
             count++;
-            await create_document(document);
         }
+
     }
     console.log(`Inserted ${count} new documents`);
+
 }
 
 main();