-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_dataset_with_splits.py
76 lines (55 loc) · 3.11 KB
/
create_dataset_with_splits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os, sys, time
import re #, json
import yaml
import argparse
import wordplay
from wordplay import Problem
import wordplay.generic
#print(wordplay.config.keys())
# Every key of wordplay.config is offered as a --site choice, except 'test'.
sites_available = [
    site_key
    for site_key in wordplay.config.keys()
    if site_key != 'test'
]

# Command-line interface: whose pages to fetch, from which site, and how many.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--author",
    type=str,
    required=True,
    help="Author 'tag' in the site URL for their pages",
)
parser.add_argument(
    "--site",
    type=str,
    required=True,
    choices=sites_available,
    help="One of the available sites",
)
parser.add_argument(
    "--pages",
    type=int,
    default=3,
    help="Number of pages to download (-1 for 'ALL')",
)
args = parser.parse_args()  # argparse exits the process here if parsing fails
#print("PARSE SUCCESS")
# --author teacow --site fifteensquared = Financial Times (Very nice wordplay)
# --author pipkirby --site timesforthetimes = Times Daily Cryptic - clean wordplay
# --author chris-woods --site timesforthetimes = Times Quick Cryptic - clean wordplay ... But still a lot of non-parses
## --author jackkt --site timesforthetimes = Times Quick Cryptic - clean wordplay (some color changes)
## --author kitty --site timesforthetimes = Times Quick Cryptic - clean wordplay (but difficult to parse)
## --author curarist --site timesforthetimes = Times Quick Cryptic - cleanish wordplay (some color changes)
## --author rolytoly --site timesforthetimes = Times Quick Cryptic - cleanish wordplay (some long explanations)
# eileen@fifteensquared = Guardian (looks good)
## Not so great for dataset building:
# davemulligangmail-com@timesforthetimes = Times Quick Cryptic (Uses 'strong' in wordplay)
# pipkirby@timesforthetimes = Times Daily Cryptic (But uses colours in the clues, rather than pure wordplay)
# Telegraph puzzles are blogged on BigDave44.com ... the explanations tend to be wordy prose, rather than annotations (based on a quick sample)
# Resolve the command-line choices: the author tag is used verbatim, while
# the site name selects that site's entry in wordplay.config.
author = args.author
site = wordplay.config[args.site]
page_limit = args.pages

# Step 1: fetch the author's index pages, then pull the per-page URLs out of them.
print("Downloading the author's index pages")
wordplay.get_all_author_index_pages(site, author=author)

print("Extracting the individual page URLs")
url_arr = wordplay.extract_individual_page_urls_for_author(site, author=author)
print(f" Found {len(url_arr)} individual page URLs")

# Step 2: download the pages. Any non-positive limit (such as the -1 that the
# --pages help text mentions) keeps the complete URL list; a positive limit
# keeps only the first `page_limit` URLs.
if page_limit <= 0:
    print("Downloading ALL listed individual HTML pages")
else:
    print(f"Downloading {page_limit} individual HTML pages")
    url_arr = url_arr[:page_limit]
wordplay.ensure_pages_downloaded(url_arr, site, author=author)

# Step 3: produce YAML for each selected page.
print(f"Processing {len(url_arr)} HTML pages")
for url in url_arr:
    # Commented-out manual steps, kept for reference when debugging one page:
    #   fname_stub = wordplay.url_to_fname_stub(url)
    #   print(f" Processing Page '{fname_stub}'")
    #   soup = wordplay.get_content_from(site, fname_stub, author=author)
    #   content = soup.select_one(site['css_content'])
    #
    #   clue_starts = wordplay.generic.get_most_important_node_arr(content, debug=True)
    #   print(f" Found {len(clue_starts)} potential clue-starts")
    #
    #   problem_list = wordplay.generic.build_problem_arr(clue_starts, content.next_sibling, debug=True)
    #   print(f" Found {len(problem_list)} Clues ... testing validity")
    wordplay.create_yaml_from_url(site, url, author=author, use_custom=False)

# Step 4: collect the per-page results into the author's jsonl files.
print("Gathering pages into author jsonl files (with splits)")
wordplay.gather_data_for_author(site, author=author)