Commit c2a1f6e

WIP function page parser
1 parent abf5833 commit c2a1f6e

File tree

1 file changed (+136, -28 lines)


migrate/oldwiki/scrape.py

Lines changed: 136 additions & 28 deletions
@@ -293,6 +293,26 @@ def parse_description(content_div):
 
     return the_description
 
+def parse_issues(content_div):
+    issues = []
+    for a in content_div.find_all("a", href=True):
+        # Ignore if parent div has class "note-messagebox"
+        if "note-messagebox" in a.find_parent("div").get("class", []):
+            continue
+        href = a["href"]
+        if "github.com/multitheftauto/mtasa-blue/issues/" in href:
+            issue_number = href.split("/")[-1]
+            # Find the next td, that is the description
+            issue_desc = "TODO"
+            next_td = a.find_next("td")
+            if next_td:
+                issue_desc = next_td.get_text(strip=True)
+            issues.append({
+                "id": issue_number,
+                "description": issue_desc
+            })
+    return issues
+
 def get_page_from_cache_or_fetch(page_url: str, page_name: str) -> str:
     """Get the page content from cache or fetch it if not cached."""
     cache_file = os.path.join(PAGES_CACHE_DIR, f"{page_name}.html")
@@ -309,6 +329,31 @@ def get_page_from_cache_or_fetch(page_url: str, page_name: str) -> str:
     else:
         raise ValueError(f"Failed to fetch {page_url}: {response.status_code}")
 
+def print_additional_headers_found_in_page(content_div, handled_header_names, page_url):
+    """Print any additional headers found in the content_div that were not handled."""
+    additional_headers = []
+    # Ignore headers from see also
+    IGNORE_WORDS = [
+        "see also", "events", "functions", "changelog",
+        "game processing order", "input", "gui",
+        "browsers", "buttons", "checkboxes", "comboboxes",
+        "edit boxes", "gridlists", "memos", "progressbars", "radio buttons",
+        "scrollbars", "scrollpanes", "static images", "tab Panels", "tabs",
+        "tab panels", "text labels", "windows"
+    ]
+    for header in content_div.find_all(["h2", "h3"]):
+        header_text = header.get_text(strip=True)
+        if header_text and header_text not in handled_header_names:
+            header_text_lower = header_text.lower()
+            # Ignore some headers that are not relevant
+            if any(ignore_word in header_text_lower for ignore_word in IGNORE_WORDS):
+                continue
+            additional_headers.append(header_text)
+
+    if additional_headers:
+        print(f"Other headers found in {page_url}:")
+        print(f" {', '.join(additional_headers)}")
+
 def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
     response_text = get_page_from_cache_or_fetch(page_url, name)
 
@@ -318,6 +363,10 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
     content_div = soup.find("div", id="mw-content-text")
     if not content_div:
         raise ValueError(f"Could not find content in {page_url}")
+
+    stop_if_deprecated(content_div, page_url)
+
+    handled_header_names = []
 
     event_type = "client" if "Client" in source else "server"
 
@@ -330,6 +379,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
     parameters_header = content_div.find("span", id="Parameters")
 
     if parameters_header:
+        handled_header_names.append("Parameters")
         params = []
         next_element = parameters_header.find_next()
 
@@ -393,6 +443,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
     event_source = None
     source_header = content_div.find("span", id="Source")
     if source_header:
+        handled_header_names.append("Source")
        source_paragraph = source_header.find_next("p")
        if source_paragraph:
            source_text = source_paragraph.get_text().strip()
@@ -409,6 +460,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
     event_canceling = None
     canceling_header = content_div.find("span", id="Canceling") or content_div.find("span", id="Cancelling") or content_div.find("span", id="Cancel_effect") or content_div.find("span", id="Cancel_effects") or content_div.find("span", id="Cancel_Effect") or content_div.find("span", id="Cancel_Effects")
     if canceling_header:
+        handled_header_names.append(canceling_header.text.strip())
        # Extract text
        canceling_paragraph = canceling_header.find_next("p")
        if canceling_paragraph:
@@ -419,11 +471,11 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
 
     # Examples
     examples = parse_examples(content_div)
+    handled_header_names.append("Examples")
+    handled_header_names.append("Example")
     if len(examples) == 0:
         print(f"Event is missing code examples: {page_url}")
 
-    # For each example, create a .lua file with the code
-    # with name eventName-index.lua
     example_index = 1
     added_examples = []
     for example in examples:
@@ -444,6 +496,35 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
 
     event_notes, event_meta = parse_notes(content_div)
 
+    # Parse Type section, put it into a note
+    type_header = content_div.find("span", id="Type")
+    if type_header:
+        type_paragraph = type_header.find_next("p")
+        if type_paragraph:
+            type_text = type_paragraph.get_text().strip()
+            if type_text:
+                # Remove new lines from the type text
+                type_text = type_text.replace("\n", " ")
+                # Look for any list after that paragraph
+                list_items = type_paragraph.find_next("ul")
+                if list_items:
+                    prev_header = type_paragraph.find_previous("h2") or type_paragraph.find_previous("h3")
+                    if prev_header and prev_header.getText(strip=True) == "Type":
+                        # If the header is "Type", we can safely add the list items to the type text
+                        type_text += " " + ", ".join(li.get_text(strip=True) for li in list_items.find_all("li"))
+
+                event_notes.append({
+                    "type": "info",
+                    "content": type_text
+                })
+        handled_header_names.append("Type")
+
+    # Parse Issues
+    event_issues = parse_issues(content_div)
+    handled_header_names.append("Issues")
+
+    print_additional_headers_found_in_page(content_div, handled_header_names, page_url)
+
     yaml_dict = {
         "name": name,
         "type": event_type,
@@ -458,38 +539,72 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
         yaml_dict["notes"] = event_notes
     if event_meta:
         yaml_dict["meta"] = event_meta
+    if event_issues:
+        yaml_dict["issues"] = event_issues
 
     # Set incomplete to true if no description is found for at least one parameter
     if any(param["description"] == "MISSING_PARAM_DESC" for param in event_parameters):
         yaml_dict["incomplete"] = True
 
     return yaml_dict
 
+def stop_if_deprecated(content_div, page_url: str):
+    deprecated_texts = [
+        "This function is deprecated",
+        "Function has been disabled",
+        "This function is provided by the external",
+        "This page is marked for deletion"
+        # "BEFORE VERSION",
+    ]
+    for text in deprecated_texts:
+        if content_div.find(string=lambda s: s and text in s):
+            raise ValueError(f"Found {text} in {page_url}. Please review manually.")
+
 def parse_function_page(page_url: str, category: str, name: str, source: str) -> dict:
     response_text = get_page_from_cache_or_fetch(page_url, name)
 
     soup = BeautifulSoup(response_text, "html.parser")
     content_div = soup.find("div", id="mw-content-text")
     if not content_div:
         raise ValueError(f"Could not find content in {page_url}")
+
+    stop_if_deprecated(content_div, page_url)
+
+    handled_header_names = []
 
     func_type = "shared" if "Shared" in source else "server" if "Server" in source else "client"
 
     func_description = parse_description(content_div)
     if func_description is None:
         raise ValueError(f"Could not find a valid description for {name} in {page_url}")
 
-    func_notes, func_meta = parse_notes(content_div)
+    func_pair = None
+    counterpart_b = content_div.find("b", string="Counterpart")
+    if counterpart_b:
+        i_tag = counterpart_b.find_next("i")
+        if i_tag and i_tag.a:
+            func_pair = i_tag.a.text.strip()
 
+    func_notes, func_meta = parse_notes(content_div)
+
+    # Syntax: parameters and returns TODO
+    handled_header_names.append("Syntax")
+    handled_header_names.append("Parameters")
+    handled_header_names.append("Arguments")
+    handled_header_names.append("Required Arguments")
+    handled_header_names.append("Required arguments")
+    handled_header_names.append("Optional Arguments")
+    handled_header_names.append("Optional arguments")
+    handled_header_names.append("Returns")
+
 
     # Examples
     examples = parse_examples(content_div)
+    handled_header_names.append("Examples")
+    handled_header_names.append("Example")
     # if len(examples) == 0:
     #     print(f"Function is missing code examples: {page_url}")
 
-
-    # For each example, create a .lua file with the code
-    # with name eventName-index.lua
     example_index = 1
     added_examples = []
     for example in examples:
@@ -508,35 +623,28 @@ def parse_function_page(page_url: str, category: str, name: str, source: str) -> dict:
         })
         example_index += 1
 
+    # Parse Issues
+    func_issues = parse_issues(content_div)
+    handled_header_names.append("Issues")
+
+    print_additional_headers_found_in_page(content_div, handled_header_names, page_url)
 
     yaml_dict = {
         func_type: {
             "name": name,
             "description": func_description,
             "parameters": [],
             "examples": added_examples,
-            "notes": func_notes,
-            "meta": func_meta
         }
     }
-
-    # if source.startswith("Shared"):
-    #     yaml_content = "shared: &shared\n"
-    #     yaml_content += f" incomplete: true\n"
-    #     yaml_content += f" name: {name}\n"
-    #     yaml_content += f" description: TODO\n"
-    #     yaml_content += "\nserver:\n <<: *shared"
-    #     yaml_content += "\nclient:\n <<: *shared"
-    # elif source.startswith("Server"):
-    #     yaml_content = "server:\n"
-    #     yaml_content += f" incomplete: true\n"
-    #     yaml_content += f" name: {name}\n"
-    #     yaml_content += f" description: TODO\n"
-    # elif source.startswith("Client"):
-    #     yaml_content = "client:\n"
-    #     yaml_content += f" incomplete: true\n"
-    #     yaml_content += f" name: {name}\n"
-    #     yaml_content += f" description: TODO\n"
+    if func_pair:
+        yaml_dict[func_type]["pair"] = func_pair
+    if func_notes:
+        yaml_dict[func_type]["notes"] = func_notes
+    if func_meta:
+        yaml_dict[func_type]["meta"] = func_meta
+    if func_issues:
+        yaml_dict[func_type]["issues"] = func_issues
 
     return yaml_dict
 
@@ -599,8 +707,8 @@ def main():
 
     # TEST Parse only these:
     # functions_by_source["Shared functions"] = {
-    #     "Element": [
-    #         ("https://wiki.multitheftauto.com/wiki/SetElementParent", "setElementParent"),
+    #     "Player": [
+    #         ("https://wiki.multitheftauto.com/wiki/SetPlayerName", "setPlayerName"),
     #     ]
     # }
 
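For reference, a minimal sketch (not part of this commit) of how the new parse_issues helper could be exercised against a small wiki-style HTML fragment. It assumes bs4 is installed and that the snippet is run from migrate/oldwiki so scrape.py is importable; the fragment and issue number are made up for illustration.

# Minimal usage sketch; assumes scrape.py is importable and bs4 is installed.
from bs4 import BeautifulSoup
from scrape import parse_issues  # hypothetical import path; the commit only defines the helper

# A tiny wiki-style fragment: an issue link in one cell, its description in the next.
html = """
<div id="mw-content-text">
  <table>
    <tr>
      <td><a href="https://github.com/multitheftauto/mtasa-blue/issues/1234">#1234</a></td>
      <td>Example issue description</td>
    </tr>
  </table>
</div>
"""

content_div = BeautifulSoup(html, "html.parser").find("div", id="mw-content-text")
print(parse_issues(content_div))
# Expected: [{'id': '1234', 'description': 'Example issue description'}]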
