1
+ import time
2
+ import logging
3
+ import urllib .request
4
+ from collections import OrderedDict
5
+ import xml .etree .ElementTree as ET
6
+ from aslite .db import get_papers_db , get_metas_db
7
+
8
+
9
def get_response(category, resumption_token=None):
    """Fetch one page of a ListRecords harvest from the arXiv OAI-PMH endpoint.

    Args:
        category: Name of the OAI set to harvest. Only sent on the first
            request of a harvest, i.e. when ``resumption_token`` is None.
        resumption_token: Token taken from the previous page. When given, it
            is the only argument sent besides ``verb``; ``set`` and
            ``metadataPrefix`` are omitted.

    Returns:
        The raw response body as returned by ``read()`` (bytes).

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g. ``urllib.error.URLError``);
        nothing is caught here.
    """
    # Function-scope import so this block does not depend on the file's import list.
    from urllib.parse import urlencode

    params = {"verb": "ListRecords"}
    if resumption_token is not None:
        params["resumptionToken"] = resumption_token
    else:
        params["set"] = category
        params["metadataPrefix"] = "arXiv"

    # urlencode percent-escapes the values, so a token containing characters
    # such as '|' or whitespace cannot corrupt the query string.
    oai_url = f"http://export.arxiv.org/oai2?{urlencode(params)}"

    with urllib.request.urlopen(oai_url) as url:
        response = url.read()
        status = url.status

    if status != 200:
        # The module never defines a global `logger`; obtain one here so this
        # branch logs instead of failing with NameError.
        logging.getLogger(__name__).error("arxiv did not return status 200 response")

    return response
23
+
24
# XML namespace prefixes (Clark notation) used by the OAI-PMH envelope and
# by the arXiv metadata payload.
_OAI_NS = "{http://www.openarchives.org/OAI/2.0/}"
_ARXIV_NS = "{http://arxiv.org/OAI/arXiv/}"
_ATOM_SCHEME = "http://arxiv.org/schemas/atom"

# Only papers tagged with at least one of these categories are stored.
_WANTED_CATEGORIES = frozenset(
    ("cs.CV", "cs.LG", "cs.CL", "cs.AI", "cs.NE", "cs.RO", "cs.IT")
)


def _arxiv_text(parent, tag):
    """Return the text of the arXiv-namespaced child ``tag``, or None if absent."""
    node = parent.find(f"{_ARXIV_NS}{tag}")
    return None if node is None else node.text


def _author_name(author):
    """Build a display name for one <author> element; "" if it has no name parts."""
    forename = _arxiv_text(author, "forename")
    if forename is None:
        forename = _arxiv_text(author, "forenames")
    keyname = _arxiv_text(author, "keyname")
    # NOTE(review): the previous code concatenated keyname + forename with no
    # separator ("SmithJohn"). Joined here as "<forenames> <keyname>" -- confirm
    # this is the order downstream consumers expect.
    parts = [part.strip() for part in (forename, keyname) if part and part.strip()]
    return " ".join(parts)


def _record_to_entry(record):
    """Convert one OAI <record> into the paper dict, or None if it is unusable.

    A record is unusable when it has no arXiv metadata payload or lacks the
    id / created fields that every other field is derived from.
    """
    metadata = record.find(f"{_OAI_NS}metadata/{_ARXIV_NS}arXiv")
    if metadata is None:
        return None

    arxiv_id = (_arxiv_text(metadata, "id") or "").strip()
    published = (_arxiv_text(metadata, "created") or "").strip()
    if not arxiv_id or not published:
        logging.getLogger(__name__).warning(
            "skipping record without id/created (id=%r)", arxiv_id
        )
        return None

    # Papers that were never revised have no <updated>; fall back to <created>.
    updated = (_arxiv_text(metadata, "updated") or "").strip() or published
    terms = (_arxiv_text(metadata, "categories") or "").split()

    abs_url = f"http://arxiv.org/abs/{arxiv_id}"
    data = {}
    data["id"] = abs_url
    data["guidislink"] = True
    data["link"] = abs_url
    data["published"] = published
    data["updated"] = updated
    data["published_parsed"] = time.strptime(published, "%Y-%m-%d")
    data["updated_parsed"] = time.strptime(updated, "%Y-%m-%d")
    data["title"] = _arxiv_text(metadata, "title")
    data["summary"] = _arxiv_text(metadata, "abstract")

    authors_node = metadata.find(f"{_ARXIV_NS}authors")
    names = (
        [] if authors_node is None else [_author_name(a) for a in authors_node]
    )
    data["authors"] = [{"name": name} for name in names if name]
    if data["authors"]:
        # Both keys alias the last author dict, as before; they are left unset
        # when the record lists no authors instead of raising IndexError.
        data["author_detail"] = data["authors"][-1]
        data["author"] = data["authors"][-1]

    data["links"] = [
        {
            "href": abs_url,
            "rel": "alternate",
            "type": "text/html",
        },
        {
            "title": "pdf",
            # Built from the id directly; a blanket "abs" -> "pdf" replace on
            # the URL would also rewrite any "abs" inside the id itself.
            "href": f"http://arxiv.org/pdf/{arxiv_id}",
            "rel": "related",
            "type": "application/pdf",
        },
    ]
    data["arxiv_primary_category"] = {
        # The first listed category is treated as the primary one.
        "term": terms[0] if terms else None,
        "scheme": _ATOM_SCHEME,
    }
    data["tags"] = [
        {"term": term, "scheme": _ATOM_SCHEME, "label": None} for term in terms
    ]
    data["_idv"] = arxiv_id
    data["_id"] = arxiv_id
    data["_version"] = 1
    data["_time"] = time.mktime(data["updated_parsed"])
    data["_time_str"] = time.strftime("%b %d %Y", data["updated_parsed"])
    return data


def parse(response):
    """Store the wanted papers from one ListRecords page and return the next token.

    Every <record> in ``response`` is converted to a paper dict. Records tagged
    with at least one category in ``_WANTED_CATEGORIES`` are written to the
    papers store (full dict) and the metas store (``{"_time": ...}``), keyed
    by arXiv id. An existing entry is only overwritten by a record with a
    strictly newer ``_time``.

    Args:
        response: XML document (bytes or str) as returned by ``get_response``.

    Returns:
        The resumption token for the next page, or None when the response
        carries no token (last page, single-page result, or an OAI error).

    Raises:
        xml.etree.ElementTree.ParseError: if ``response`` is not well-formed XML.
        ValueError: if a record's date is not in ``%Y-%m-%d`` form.
    """
    # Both stores are used as mappings (membership test, item get/set).
    pdb = get_papers_db(flag="c")
    mdb = get_metas_db(flag="c")
    root = ET.fromstring(response)

    # An OAI-PMH error response has <error> elements instead of <ListRecords>.
    for err in root.iter(f"{_OAI_NS}error"):
        logging.getLogger(__name__).error(
            "OAI-PMH error %s: %s", err.get("code"), err.text
        )

    for record in root.iter(f"{_OAI_NS}record"):
        data = _record_to_entry(record)
        if data is None:
            continue
        if not any(tag["term"] in _WANTED_CATEGORIES for tag in data["tags"]):
            continue
        pid = data["_id"]
        if pid not in pdb or data["_time"] > pdb[pid]["_time"]:
            pdb[pid] = data
            mdb[pid] = {"_time": data["_time"]}

    # The token element may be missing entirely or present but empty; both
    # mean there is nothing further to fetch.
    token = root.findtext(f"{_OAI_NS}ListRecords/{_OAI_NS}resumptionToken")
    token = token.strip() if token else ""
    return token or None
104
+
105
+
106
+
107
if __name__ == "__main__":
    # Harvest the "cs" set page by page: fetch the first page, then keep
    # following the resumption token that parse() hands back until it is None.
    response = get_response("cs")
    resumption_token = parse(response)
    print("Resumption Token", resumption_token)
    while resumption_token is not None:
        # Pause between consecutive requests to the server.
        time.sleep(5)
        response = get_response("cs", resumption_token)
        resumption_token = parse(response)
        print("Resumption Token", resumption_token)
0 commit comments