Skip to content

Commit 4f8a63a

Browse files
committed
add bulk_load script
1 parent 1d17786 commit 4f8a63a

File tree

4 files changed

+118
-6
lines changed

4 files changed

+118
-6
lines changed

arxiv_daemon.py

-3
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,6 @@ def store(p):
5252
try:
5353
resp = get_response(search_query=q, start_index=k)
5454
papers = parse_response(resp)
55-
print(papers[0].keys())
56-
print(papers[0].items())
57-
input()
5855
time.sleep(0.5)
5956
if len(papers) == 100:
6057
break # otherwise we have to try again

aslite/arxiv.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,4 @@ def filter_latest_version(idvs):
7878
pid_to_v[pid] = max(int(v), pid_to_v.get(pid, 0))
7979

8080
filt = [f"{pid}v{v}" for pid, v in pid_to_v.items()]
81-
return filt
81+
return filt

aslite/db.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def get_metas_db(flag='r', autocommit=True):
114114
mdb = SqliteDict(PAPERS_DB_FILE, tablename='metas', flag=flag, autocommit=autocommit)
115115
return mdb
116116

117-
def get_tags_db(flag='r', autocommit=True):
117+
def get_tags_db(flag='c', autocommit=True):
118118
assert flag in ['r', 'c']
119119
tdb = CompressedSqliteDict(DICT_DB_FILE, tablename='tags', flag=flag, autocommit=autocommit)
120120
return tdb
@@ -124,7 +124,7 @@ def get_last_active_db(flag='r', autocommit=True):
124124
ladb = SqliteDict(DICT_DB_FILE, tablename='last_active', flag=flag, autocommit=autocommit)
125125
return ladb
126126

127-
def get_email_db(flag='r', autocommit=True):
127+
def get_email_db(flag='c', autocommit=True):
128128
assert flag in ['r', 'c']
129129
edb = SqliteDict(DICT_DB_FILE, tablename='email', flag=flag, autocommit=autocommit)
130130
return edb

bulk_load.py

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import time
2+
import logging
3+
import urllib.request
4+
from collections import OrderedDict
5+
import xml.etree.ElementTree as ET
6+
from aslite.db import get_papers_db, get_metas_db
7+
8+
9+
def get_response(category, resumption_token=None):
    """Fetch one page of arXiv OAI-PMH ``ListRecords`` results.

    Args:
        category: OAI set name to harvest (e.g. ``"cs"``). Only sent on the
            first request; it is not sent when a resumption token is given.
        resumption_token: Token returned with the previous page, or ``None``
            to start a new harvest.

    Returns:
        The raw response body as ``bytes``.

    Raises:
        urllib.error.URLError: if the request itself fails (``urlopen``
            raises ``HTTPError`` for HTTP error statuses).
    """
    # Function-scope import so the block does not depend on a new
    # file-level import.
    from urllib.parse import urlencode

    logger = logging.getLogger(__name__)

    params = {"verb": "ListRecords"}
    if resumption_token is not None:
        # A resumption token replaces every other argument except the verb.
        params["resumptionToken"] = resumption_token
    else:
        params["set"] = category
        params["metadataPrefix"] = "arXiv"

    # urlencode percent-escapes characters that are not legal in a query
    # string, which an opaque resumption token may contain.
    oai_url = "http://export.arxiv.org/oai2?" + urlencode(params)

    with urllib.request.urlopen(oai_url) as url:
        response = url.read()
        # Read the status while the response object is still open.
        status = url.status

    if status != 200:
        logger.error("arxiv did not return status 200 response")

    return response
23+
24+
# XML namespaces used by OAI-PMH envelopes and the arXiv metadata format.
_OAI_NS = "{http://www.openarchives.org/OAI/2.0/}"
_ARXIV_NS = "{http://arxiv.org/OAI/arXiv/}"
_ATOM_SCHEME = "http://arxiv.org/schemas/atom"

# Only papers tagged with at least one of these categories are stored.
_WANTED_CATEGORIES = frozenset(
    ["cs.CV", "cs.LG", "cs.CL", "cs.AI", "cs.NE", "cs.RO", "cs.IT"]
)


def _author_name(author):
    """Return the display name ("forenames keyname") of one author element."""
    keyname = author.findtext(f"{_ARXIV_NS}keyname") or ""
    # Both the singular and the plural tag are accepted for the given name.
    forenames = author.findtext(f"{_ARXIV_NS}forename")
    if forenames is None:
        forenames = author.findtext(f"{_ARXIV_NS}forenames")
    # Join with a space and skip missing parts so names never run together.
    return " ".join(part for part in (forenames, keyname) if part)


def _record_to_paper(metadata):
    """Convert one ``<arXiv>`` metadata element into a paper dict.

    The keys written here are the ones this script stores in the papers
    database; ``_time`` is derived from the ``updated`` date, falling back
    to the ``created`` date when no update is recorded.
    """
    def text(tag):
        return metadata.findtext(f"{_ARXIV_NS}{tag}")

    arxiv_id = text("id")
    abs_url = f"http://arxiv.org/abs/{arxiv_id}"

    published = text("created")
    updated = text("updated") or published
    published_parsed = time.strptime(published, "%Y-%m-%d")
    updated_parsed = time.strptime(updated, "%Y-%m-%d")

    authors_el = metadata.find(f"{_ARXIV_NS}authors")
    authors = [
        {"name": _author_name(author)}
        for author in (authors_el if authors_el is not None else ())
    ]
    # The last listed author is used for the single-author fields.
    last_author = authors[-1] if authors else {"name": ""}

    categories = (text("categories") or "").split()

    return {
        "id": abs_url,
        "guidislink": True,
        "link": abs_url,
        "published": published,
        "updated": updated,
        "published_parsed": published_parsed,
        "updated_parsed": updated_parsed,
        "title": text("title"),
        "summary": text("abstract"),
        "authors": authors,
        "author_detail": last_author,
        "author": last_author,
        "links": [
            {
                "href": abs_url,
                "rel": "alternate",
                "type": "text/html",
            },
            {
                "title": "pdf",
                # Built directly so an "abs" substring elsewhere in the URL
                # can never be rewritten by accident.
                "href": f"http://arxiv.org/pdf/{arxiv_id}",
                "rel": "related",
                "type": "application/pdf",
            },
        ],
        "arxiv_primary_category": {
            # The first listed category is treated as the primary one.
            "term": categories[0] if categories else None,
            "scheme": _ATOM_SCHEME,
        },
        "tags": [
            {"term": cat, "scheme": _ATOM_SCHEME, "label": None}
            for cat in categories
        ],
        # This metadata format carries no version number, so the bare id is
        # stored for both keys and the version is fixed at 1.
        "_idv": arxiv_id,
        "_id": arxiv_id,
        "_version": 1,
        "_time": time.mktime(updated_parsed),
        "_time_str": time.strftime("%b %d %Y", updated_parsed),
    }


def parse(response):
    """Store the papers of one OAI-PMH ``ListRecords`` page.

    Each record tagged with at least one wanted category is written to the
    papers and metas databases, unless an entry with the same id and an
    equal or newer ``_time`` is already stored.

    Args:
        response: Raw XML body of a ``ListRecords`` response.

    Returns:
        The resumption token for the next page, or ``None`` when the
        response carries no (or an empty) token, i.e. the harvest is done.

    Raises:
        RuntimeError: if the response is an OAI-PMH error document.
        xml.etree.ElementTree.ParseError: if the body is not well-formed XML.
    """
    root = ET.fromstring(response)

    # Fail with the server's own error code instead of an AttributeError
    # further down.
    error = root.find(f"{_OAI_NS}error")
    if error is not None:
        raise RuntimeError(f"OAI-PMH error {error.get('code')}: {error.text}")

    # NOTE(review): assumes both handles are sqlitedict objects, which are
    # context managers; closing them avoids leaking a connection per page.
    with get_papers_db(flag="c") as pdb, get_metas_db(flag="c") as mdb:
        for record in root.iter(f"{_OAI_NS}record"):
            metadata = record.find(f"{_OAI_NS}metadata/{_ARXIV_NS}arXiv")
            if metadata is None:
                # Deleted records carry a header but no metadata.
                continue

            data = _record_to_paper(metadata)
            if not any(tag["term"] in _WANTED_CATEGORIES for tag in data["tags"]):
                continue

            pid = data["_id"]
            if pid not in pdb or data["_time"] > pdb[pid]["_time"]:
                pdb[pid] = data
                mdb[pid] = {"_time": data["_time"]}

    token = root.findtext(f"{_OAI_NS}ListRecords/{_OAI_NS}resumptionToken")
    token = token.strip() if token else ""
    return token or None
104+
105+
106+
107+
if __name__ == "__main__":
    # Harvest the whole "cs" set page by page. The first request carries no
    # token; every later request passes the token returned with the previous
    # page, until parse() reports that there is none left.
    token = None
    while True:
        token = parse(get_response("cs", token))
        print("Resumption Token", token)
        if token is None:
            break
        # Pause between pages.
        time.sleep(5)

0 commit comments

Comments
 (0)