Skip to content

Commit abf5833

Browse files
Add caching of func and event pages to scraper
1 parent 9f6faeb commit abf5833

File tree

2 files changed

+81
-16
lines changed

2 files changed

+81
-16
lines changed

migrate/oldwiki/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
 output/
+cache/
 __pycache__/

migrate/oldwiki/scrape.py

Lines changed: 80 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,40 @@
1+
# Comments:
2+
# - Running without skipping cache for the 1st time will take a while, but subsequent runs will be much faster
3+
# - Parsing functions takes a lot longer than events because there's a lot more
4+
15
import requests
26
from bs4 import BeautifulSoup
37
from html_to_markdown import convert_to_markdown
48
import yaml
59

10+
import time
611
import os
712
import shutil
813

9-
# 🌐 URL constants
14+
# Cache of event/function Wiki pages
15+
SKIP_CACHE = False # Set to True to skip cache and always fetch fresh pages
16+
PAGES_CACHE_DIR = "./cache/pages"
17+
18+
# Function listings URLs
1019
URL_CLIENT_FUNCS = "https://wiki.multitheftauto.com/wiki/Client_Scripting_Functions"
1120
URL_SERVER_FUNCS = "https://wiki.multitheftauto.com/wiki/Server_Scripting_Functions"
1221
URL_SHARED_FUNCS = "https://wiki.multitheftauto.com/wiki/Shared_Scripting_Functions"
1322

23+
# Event listings URLs
1424
URL_CLIENT_EVENTS = "https://wiki.multitheftauto.com/wiki/Client_Scripting_Events"
1525
URL_SERVER_EVENTS = "https://wiki.multitheftauto.com/wiki/Server_Scripting_Events"
1626

27+
# Output directories
1728
FUNCTIONS_DIR = "./output/functions"
1829
EVENTS_DIR = "./output/events"
1930

31+
# Rename some categories
2032
CATEGORY_CORRECTIONS = {
2133
'SQL': 'Database',
2234
'Collision_shape': 'Colshape',
2335
}
2436

37+
# Don't include these items from the listings
2538
NAME_BLACKLIST = [
2639
'Matrix',
2740
'Vector'
@@ -41,7 +54,7 @@ def fix_category(category_name: str) -> str:
4154
return category_name
4255

4356
def parse_links(source_label: str, url: str) -> dict:
44-
print(f"Parsing list of {source_label} from {url}...")
57+
print(f"Parsing list of {source_label} ...")
4558

4659
response = requests.get(url)
4760
soup = BeautifulSoup(response.text, "html.parser")
@@ -279,11 +292,27 @@ def parse_description(content_div):
279292
break
280293

281294
return the_description
282-
295+
296+
def get_page_from_cache_or_fetch(page_url: str, page_name: str) -> str:
    """Return the raw HTML for a wiki page, serving it from the on-disk cache when possible.

    The cache entry is ``PAGES_CACHE_DIR/<page_name>.html``. A cached copy is
    used unless ``SKIP_CACHE`` is True; on a cache miss (or when skipping the
    cache) the page is fetched with ``requests`` and the cache entry is
    (re)written before returning.

    Args:
        page_url: Full URL of the wiki page to fetch.
        page_name: Page identifier used as the cache file stem.
            NOTE(review): assumed to be filesystem-safe (no '/' etc.) — the
            callers pass wiki item names; confirm if that ever changes.

    Returns:
        The page HTML as a string.

    Raises:
        ValueError: If the HTTP fetch does not return status 200.
    """
    # Fix: create the cache directory here so the cache write below cannot
    # fail with FileNotFoundError when main() has not created it yet.
    os.makedirs(PAGES_CACHE_DIR, exist_ok=True)

    cache_file = os.path.join(PAGES_CACHE_DIR, f"{page_name}.html")
    if (not SKIP_CACHE) and os.path.exists(cache_file):
        with open(cache_file, "r", encoding="utf-8") as f:
            return f.read()

    # Fetch fresh and refresh the cache entry (also runs when SKIP_CACHE is
    # True, deliberately keeping the cache up to date for later runs).
    response = requests.get(page_url)
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch {page_url}: {response.status_code}")
    with open(cache_file, "w", encoding="utf-8") as f:
        f.write(response.text)
    return response.text
283311

284312
def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
285-
response = requests.get(page_url)
286-
soup = BeautifulSoup(response.text, "html.parser")
313+
response_text = get_page_from_cache_or_fetch(page_url, name)
314+
315+
soup = BeautifulSoup(response_text, "html.parser")
287316

288317
# Find first non-empty p inside mw-content-text
289318
content_div = soup.find("div", id="mw-content-text")
@@ -391,7 +420,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> di
391420
# Examples
392421
examples = parse_examples(content_div)
393422
if len(examples) == 0:
394-
print(f"Found no examples for {name}")
423+
print(f"Event is missing code examples: {page_url}")
395424

396425
# For each example, create a .lua file with the code
397426
# with name eventName-index.lua
@@ -409,7 +438,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> di
409438
added_examples.append({
410439
"path": 'examples/' + example_filename,
411440
"description": example_description,
412-
"side": example.get("type") or event_type # Default to event_type if not specified
441+
"side": example.get("type") or event_type # Default to this if not specified
413442
})
414443
example_index += 1
415444

@@ -437,8 +466,9 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> di
437466
return yaml_dict
438467

439468
def parse_function_page(page_url: str, category: str, name: str, source: str) -> dict:
440-
response = requests.get(page_url)
441-
soup = BeautifulSoup(response.text, "html.parser")
469+
response_text = get_page_from_cache_or_fetch(page_url, name)
470+
471+
soup = BeautifulSoup(response_text, "html.parser")
442472
content_div = soup.find("div", id="mw-content-text")
443473
if not content_div:
444474
raise ValueError(f"Could not find content in {page_url}")
@@ -450,13 +480,41 @@ def parse_function_page(page_url: str, category: str, name: str, source: str) ->
450480
raise ValueError(f"Could not find a valid description for {name} in {page_url}")
451481

452482
func_notes, func_meta = parse_notes(content_div)
483+
484+
485+
# Examples
486+
examples = parse_examples(content_div)
487+
# if len(examples) == 0:
488+
# print(f"Function is missing code examples: {page_url}")
489+
490+
491+
# For each example, create a .lua file with the code
492+
# with name eventName-index.lua
493+
example_index = 1
494+
added_examples = []
495+
for example in examples:
496+
example_code = example.get("code", "").strip()
497+
if example_code:
498+
example_filename = f"{name}-{example_index}.lua"
499+
example_path = os.path.join(FUNCTIONS_DIR, category, 'examples', example_filename)
500+
os.makedirs(os.path.dirname(example_path), exist_ok=True)
501+
with open(example_path, "w", encoding="utf-8") as example_file:
502+
example_file.write(example_code)
503+
example_description = example.get("description", "").strip()
504+
added_examples.append({
505+
"path": 'examples/' + example_filename,
506+
"description": example_description,
507+
"side": example.get("type") or func_type # Default to this if not specified
508+
})
509+
example_index += 1
510+
453511

454512
yaml_dict = {
455513
func_type: {
456514
"name": name,
457515
"description": func_description,
458516
"parameters": [],
459-
"examples": [],
517+
"examples": added_examples,
460518
"notes": func_notes,
461519
"meta": func_meta
462520
}
@@ -503,6 +561,7 @@ def convert_page_to_yaml(page_url: str, category: str, name: str, source: str) -
503561

504562
def parse_items_by_source(base_dir, data_by_source):
505563
for source, categories in data_by_source.items():
564+
started_at = time.time()
506565
print(f"Parsing individual pages of {source}...")
507566
for category, entries in categories.items():
508567
dir_path = os.path.join(base_dir, category)
@@ -522,16 +581,21 @@ def parse_items_by_source(base_dir, data_by_source):
522581
if os.path.exists(filename):
523582
os.remove(filename)
524583

525-
print(f"YAML & Lua files for {source} written successfully to {base_dir}.")
584+
print(f">> Parsed individual pages of {source} in {time.time() - started_at:.2f} seconds.")
526585

527586
def main():
587+
# Create cache directory if it doesn't exist
588+
if not os.path.exists(PAGES_CACHE_DIR):
589+
os.makedirs(PAGES_CACHE_DIR)
590+
print("SKIP_CACHE is set to", SKIP_CACHE)
591+
528592
functions_by_source = {}
529593
events_by_source = {}
530594

531595
# Functions
532-
# functions_by_source["Shared functions"] = parse_links("Shared functions", URL_SHARED_FUNCS)
533-
# functions_by_source["Client functions"] = parse_links("Client functions", URL_CLIENT_FUNCS)
534-
# functions_by_source["Server functions"] = parse_links("Server functions", URL_SERVER_FUNCS)
596+
functions_by_source["Shared functions"] = parse_links("Shared functions", URL_SHARED_FUNCS)
597+
functions_by_source["Client functions"] = parse_links("Client functions", URL_CLIENT_FUNCS)
598+
functions_by_source["Server functions"] = parse_links("Server functions", URL_SERVER_FUNCS)
535599

536600
# TEST Parse only these:
537601
# functions_by_source["Shared functions"] = {
@@ -541,8 +605,8 @@ def main():
541605
# }
542606

543607
# Events
544-
events_by_source["Client events"] = parse_links("Client events", URL_CLIENT_EVENTS)
545-
events_by_source["Server events"] = parse_links("Server events", URL_SERVER_EVENTS)
608+
# events_by_source["Client events"] = parse_links("Client events", URL_CLIENT_EVENTS)
609+
# events_by_source["Server events"] = parse_links("Server events", URL_SERVER_EVENTS)
546610

547611
# Empty output directory
548612
if os.path.exists("./output"):

0 commit comments

Comments
 (0)