# TODO: this is pseudocode; it may not run, or may not run correctly.
2+
3+ import pandas as pd
4+
def fetch_report_urls(season):
    """Get all report URLs from a season's report index page.

    Args:
        season: The season whose report index page should be scanned. How a
            season is encoded has not been decided yet.

    Raises:
        NotImplementedError: Always; the scraping logic is not written yet.
    """
    # Fail loudly rather than returning None: callers iterate over the result,
    # and a None would surface as an unrelated-looking TypeError.
    raise NotImplementedError("fetch_report_urls is not implemented yet")
8+
# TODO: consider how to encode a "season" object, maybe as a tuple of
# start/end years `(2023, 2024)`, or a string `2023-2024`.
def fetch_one_season_from_report(season):
    """Fetch every report published for one season and combine them.

    Args:
        season: The season to fetch; passed through to `fetch_report_urls`.

    Returns:
        The `pd.concat` of the per-report results, or an empty DataFrame
        when the season has no report URLs.
    """
    report_urls = fetch_report_urls(season)

    # NOTE(review): `fetch_one_report` is not defined in the visible part of
    # this file -- presumably it returns one DataFrame per report URL; confirm.
    report_frames = [fetch_one_report(url) for url in report_urls]

    # pd.concat raises ValueError on an empty list, so handle "no reports"
    # explicitly instead of crashing.
    if not report_frames:
        return pd.DataFrame()

    return pd.concat(report_frames)
16+
def fetch_one_dashboard(url=None):
    """Get data from the current or an archived dashboard.

    Args:
        url: URL of the dashboard to scrape. When omitted (or otherwise
            falsy), the current dashboard's static URL is used.

    Raises:
        NotImplementedError: Always; the scraping logic is not written yet.
    """
    # If no url is provided, fetch data from the current dashboard (whose URL
    # is static).
    if not url:
        # NOTE(review): DEFAULT_DASHBOARD_URL is not defined in the visible
        # part of this file; it must exist at module level before this runs.
        url = DEFAULT_DASHBOARD_URL

    # TODO: put rest of scraping code in here
    # Fail loudly rather than returning None: callers pass the result to
    # pd.concat, where a None would produce a confusing error.
    raise NotImplementedError(
        f"fetch_one_dashboard is not implemented yet (url={url!r})"
    )
25+
def fetch_report_data(start_date, end_date):
    """Fetch report data issued within ``[start_date, end_date]``.

    Args:
        start_date: Earliest issue date to keep (inclusive).
        end_date: Latest issue date to keep (inclusive).

    Returns:
        The combined per-season report data, restricted to rows whose
        ``issue`` value lies in the requested range. An empty DataFrame is
        returned when no season falls in the range.
    """
    # NOTE(review): `compute_seasons_in_range` is not defined in the visible
    # part of this file.
    included_seasons = compute_seasons_in_range(start_date, end_date)

    # Fetch all reports made for each season.
    # We do this because fetching reports is pretty fast, and it saves us from
    # having to parse either URLs or text on the webpage. We will drop data
    # outside the requested range later.
    season_frames = [
        fetch_one_season_from_report(season) for season in included_seasons
    ]

    # pd.concat raises ValueError on an empty list; there is also no `issue`
    # column to filter on in that case.
    if not season_frames:
        return pd.DataFrame()

    df = pd.concat(season_frames)

    # Only keep data that was issued within the requested date range.
    # A chained comparison (`a <= series <= b`) would call bool() on a Series
    # and raise; `between` is the element-wise, inclusive equivalent.
    return df[df["issue"].between(start_date, end_date)]
40+
def fetch_historical_dashboard_data(start_date, end_date):
    """Fetch data from every archived dashboard in a date range.

    Args:
        start_date: Start of the requested range.
        end_date: End of the requested range.

    Returns:
        The `pd.concat` of the data fetched from each archived dashboard, or
        an empty DataFrame when the range yields no dashboard URLs.
    """
    # NOTE(review): `compute_weeks_in_range` and
    # `construct_archived_dashboard_urls` are not defined in the visible part
    # of this file.
    included_weeks = compute_weeks_in_range(start_date, end_date)
    archived_dashboard_urls = construct_archived_dashboard_urls(included_weeks)

    dashboard_frames = [fetch_one_dashboard(url) for url in archived_dashboard_urls]

    # pd.concat raises ValueError on an empty list.
    if not dashboard_frames:
        return pd.DataFrame()

    return pd.concat(dashboard_frames)
49+
# NOTE(review): a function with this same name is defined earlier in this
# file; this later definition is the one Python keeps. Merge the two.
def fetch_historical_dashboard_data(start_date, end_date):
    """Fetch data from every archived dashboard in a date range.

    Outline: create all historical dashboard URLs included in the date range,
    then loop over those URLs and fetch each dashboard.

    Args:
        start_date: Start of the requested range.
        end_date: End of the requested range.

    Returns:
        The `pd.concat` of the data fetched from each archived dashboard, or
        an empty DataFrame when the range yields no dashboard URLs.
    """
    # Create all historical dashboard URLs included in the date range.
    # NOTE(review): `compute_weeks_in_range` and
    # `construct_archived_dashboard_urls` are not defined in the visible part
    # of this file.
    included_weeks = compute_weeks_in_range(start_date, end_date)
    historical_dashboard_urls = construct_archived_dashboard_urls(included_weeks)

    # Loop over the URLs, fetching one dashboard each. This previously fetched
    # *report* data per season (copied from fetch_report_data), which is not
    # what a dashboard fetcher should return.
    dashboard_frames = [
        fetch_one_dashboard(historical_dashboard_url)
        for historical_dashboard_url in historical_dashboard_urls
    ]

    # pd.concat raises ValueError on an empty list.
    if not dashboard_frames:
        return pd.DataFrame()

    return pd.concat(dashboard_frames)
61+
def fetch_current_dashboard_data():
    """Fetch data from the current (non-archived) dashboard.

    Returns:
        Whatever `fetch_one_dashboard` returns when called without a URL,
        i.e. the data for its default dashboard URL.
    """
    # Calling fetch_one_dashboard with no URL selects the current dashboard.
    # The result must be returned: fetch_data assigns this call's value.
    return fetch_one_dashboard()
64+
def fetch_data(start_date=None, end_date=None):
    """Fetch data for a date range, or the current dashboard if none is given.

    Args:
        start_date: Start of the requested range, or None.
        end_date: End of the requested range, or None.

    Returns:
        With no dates: the current dashboard's data. With both dates: report
        data for the part of the range before the dashboard release, combined
        via `pd.concat` with archived-dashboard data for the part after it.

    Raises:
        ValueError: If only one of `start_date` / `end_date` is provided.
    """
    if start_date is None and end_date is None:
        return fetch_current_dashboard_data()

    # A half-specified range is ambiguous; refuse it rather than guess.
    if start_date is None or end_date is None:
        raise ValueError("start_date and end_date must be provided together")

    # NOTE(review): `split_date_range_by_dashboard_release_date` is not defined
    # in the visible part of this file. This assumes it returns two
    # (start, end) pairs -- confirm, and decide what it returns when the whole
    # range falls on one side of the release date.
    early_range, late_range = split_date_range_by_dashboard_release_date(
        start_date, end_date
    )

    # Both fetchers take (start_date, end_date), so unpack each range.
    report_data = fetch_report_data(*early_range)
    dashboard_data = fetch_historical_dashboard_data(*late_range)

    # Lists have no .concat() method; pandas provides the concatenation.
    return pd.concat([report_data, dashboard_data])
0 commit comments