Skip to content

Commit 357472a

Browse files
committed
GitHub Actions, update dependencies, fix upload by using AsyncHTTPClient
1 parent 25e9ccc commit 357472a

File tree

4 files changed

+57
-48
lines changed

4 files changed

+57
-48
lines changed

.drone.yml

Lines changed: 0 additions & 33 deletions
This file was deleted.

.github/workflows/tests.yml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
pull_request:
8+
9+
jobs:
10+
tests:
11+
runs-on: ubuntu-latest
12+
services:
13+
es:
14+
image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2
15+
ports:
16+
- 9200:9200
17+
options: >-
18+
--env http.port=9200
19+
--env discovery.type=single-node
20+
21+
steps:
22+
- name: Checkout code
23+
uses: actions/checkout@v4
24+
25+
- name: Set up Python
26+
uses: actions/setup-python@v5
27+
with:
28+
python-version: 3.12
29+
30+
- name: Install dependencies
31+
run: |
32+
pip install --upgrade pip
33+
pip install -r requirements.txt
34+
35+
- name: Wait for Elasticsearch
36+
run: |
37+
sleep 10
38+
curl -s http://localhost:9200
39+
40+
- name: Run tests
41+
run: python3 src/index_emails.py --infile=sample.mbox --es-url=http://localhost:9200
42+

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/curre
1717

1818
A quick way to run Elasticsearch is using Docker: (the cors settings aren't really needed but come in handy if you want to use e.g. [dejavu](https://dejavu.appbase.io/) to explore the index)
1919
```
20-
docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1
20+
docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2
2121
```
2222

2323
I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Also `beautifulsoup4` for stripping HTML/JS/CSS (if you want to use the body indexing flag).

src/index_emails.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from tornado.httpclient import HTTPClient, HTTPRequest
1+
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
22
from tornado.ioloop import IOLoop
33
import tornado.options
44
import json
@@ -12,7 +12,7 @@
1212
from bs4 import BeautifulSoup
1313
import logging
1414

15-
http_client = HTTPClient()
15+
http_client = AsyncHTTPClient()
1616

1717
DEFAULT_BATCH_SIZE = 500
1818
DEFAULT_ES_URL = "http://localhost:9200"
@@ -34,17 +34,17 @@ def strip_html_css_js(msg):
3434
return text
3535

3636

37-
def delete_index():
37+
async def delete_index():
3838
try:
3939
url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name)
4040
request = HTTPRequest(url, method="DELETE", request_timeout=240, headers={"Content-Type": "application/json"})
41-
response = http_client.fetch(request)
41+
response = await http_client.fetch(request)
4242
logging.info('Delete index done %s' % response.body)
4343
except:
4444
pass
4545

4646

47-
def create_index():
47+
async def create_index():
4848

4949
schema = {
5050
"settings": {
@@ -71,7 +71,7 @@ def create_index():
7171
url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name)
7272
try:
7373
request = HTTPRequest(url, method="PUT", body=body, request_timeout=240, headers={"Content-Type": "application/json"})
74-
response = http_client.fetch(request)
74+
response = await http_client.fetch(request)
7575
logging.info('Create index done %s' % response.body)
7676
except:
7777
pass
@@ -80,7 +80,7 @@ def create_index():
8080
total_uploaded = 0
8181

8282

83-
def upload_batch(upload_data):
83+
async def upload_batch(upload_data):
8484
if tornado.options.options.dry_run:
8585
logging.info("Dry run, not uploading")
8686
return
@@ -97,7 +97,7 @@ def upload_batch(upload_data):
9797
upload_data_txt += json_item
9898

9999
request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, request_timeout=240, headers={"Content-Type": "application/json"})
100-
response = http_client.fetch(request)
100+
response = await http_client.fetch(request)
101101
result = json.loads(response.body)
102102

103103
global total_uploaded
@@ -170,11 +170,11 @@ def parse_message_parts(current_msg):
170170
return result
171171

172172

173-
def load_from_file():
173+
async def load_from_file():
174174

175175
if tornado.options.options.init:
176-
delete_index()
177-
create_index()
176+
await delete_index()
177+
await create_index()
178178

179179
if tornado.options.options.skip:
180180
logging.info("Skipping first %d messages" % tornado.options.options.skip)
@@ -198,12 +198,12 @@ def load_from_file():
198198
if item:
199199
upload_data.append(item)
200200
if len(upload_data) == tornado.options.options.batch_size:
201-
upload_batch(upload_data)
201+
await upload_batch(upload_data)
202202
upload_data = list()
203203

204204
# upload remaining items in `upload_batch`
205205
if upload_data:
206-
upload_batch(upload_data)
206+
await upload_batch(upload_data)
207207

208208
logging.info("Import done - total count %d" % len(mbox.keys()))
209209

@@ -249,7 +249,7 @@ def load_from_file():
249249

250250
tornado.options.parse_command_line()
251251

252-
#Exactly one of {infile, indir} must be set
252+
# Exactly one of {infile, indir} must be set
253253
if bool(tornado.options.options.infile) ^ bool(tornado.options.options.indir):
254254
IOLoop.instance().run_sync(load_from_file)
255255
else:

0 commit comments

Comments (0)