Skip to content

Commit 7ff6c0e

Browse files
authored
EmbedAPI: main content based on documentation tool (#11812)
We need different CSS selectors to find the main content node depending on the documentation tool used. This PR adds the `?maincontent=` query parameter to the EmbedAPI so that it accepts a specific CSS selector for the main content of the page. This parameter will be sent by the Addons frontend code, using the heuristic code we wrote there. If this specific selector is not sent, we fall back to the default selectors.

### Using the _default selector_

![Screenshot_2024-11-28_16-04-58](https://github.com/user-attachments/assets/992d2f83-97c8-4860-9950-edcb4e7e2fb2)

### Using the specific selector for this documentation tool

![Screenshot_2024-11-28_16-05-48](https://github.com/user-attachments/assets/db88e3db-d6a1-4943-b0b3-61c1ef1c78fa)
1 parent c20603e commit 7ff6c0e

File tree

4 files changed

+65
-8
lines changed

4 files changed

+65
-8
lines changed

docs/user/api/v3.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2001,10 +2001,11 @@ Embed
20012001
:query string url: full URL of the document (with optional fragment) to fetch content from.
20022002
:query string doctool: *optional* documentation tool key name used to generate the target documentation (currently, only ``sphinx`` is accepted)
20032003
:query string doctoolversion: *optional* documentation tool version used to generate the target documentation (e.g. ``4.2.0``).
2004+
:query string maincontent: *optional* CSS selector for the main content of the page (e.g. ``div#main``).
20042005

20052006
.. note::
20062007

2007-
Passing ``?doctool=`` and ``?doctoolversion=`` may improve the response,
2008+
Passing ``?doctool=``, ``?doctoolversion=`` and ``?maincontent=`` may improve the response
20082009
since the endpoint will know more about the exact structure of the HTML and can make better decisions.
20092010

20102011
Additional APIs

docs/user/guides/embedding-content.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ from our own docs and will populate the content of it into the ``#help-container
6363
'url': 'https://docs.readthedocs.io/en/latest/automation-rules.html%23creating-an-automation-rule',
6464
// 'doctool': 'sphinx',
6565
// 'doctoolversion': '4.2.0',
66+
// 'maincontent': 'div#main',
6667
};
6768
var url = 'https://readthedocs.org/api/v3/embed/?' + $.param(params);
6869
$.get(url, function(data) {

readthedocs/embed/v3/tests/test_external_pages.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,32 @@ def test_default_main_section(self, app, client, requests_mock):
8686
}
8787
compare_content_without_blank_lines(json_response["content"], content)
8888

89+
@pytest.mark.sphinx("html", srcdir=srcdir, freshenv=True)
90+
def test_specific_main_content_selector(self, app, client, requests_mock):
91+
app.build()
92+
path = app.outdir / "index.html"
93+
assert path.exists() is True
94+
content = open(path).read()
95+
requests_mock.get("https://docs.project.com", text=content)
96+
97+
params = {
98+
"url": "https://docs.project.com",
99+
"maincontent": "#invalid-selector",
100+
}
101+
response = client.get(self.api_url, params)
102+
assert response.status_code == 404
103+
104+
params = {
105+
"url": "https://docs.project.com",
106+
"maincontent": "section",
107+
}
108+
response = client.get(self.api_url, params)
109+
assert response.status_code == 200
110+
# Check the main three sections are returned.
111+
assert "Title" in response.json()["content"]
112+
assert "Sub-title" in response.json()["content"]
113+
assert "Manual Reference Section" in response.json()["content"]
114+
89115
@pytest.mark.sphinx("html", srcdir=srcdir, freshenv=True)
90116
def test_specific_identifier(self, app, client, requests_mock):
91117
app.build()

readthedocs/embed/v3/views.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class EmbedAPIBase(EmbedAPIMixin, CDNCacheTagsMixin, APIView):
5050
* url (with fragment) (required)
5151
* doctool
5252
* doctoolversion
53+
* maincontent
5354
5455
### Example
5556
@@ -121,7 +122,14 @@ def _get_page_content_from_storage(self, project, version, filename):
121122

122123
return None
123124

124-
def _get_content_by_fragment(self, url, fragment, doctool, doctoolversion):
125+
def _get_content_by_fragment(
126+
self,
127+
url,
128+
fragment,
129+
doctool,
130+
doctoolversion,
131+
selector,
132+
):
125133
if self.external:
126134
page_content = self._download_page_content(url)
127135
else:
@@ -133,10 +141,17 @@ def _get_content_by_fragment(self, url, fragment, doctool, doctoolversion):
133141
)
134142

135143
return self._parse_based_on_doctool(
136-
page_content, fragment, doctool, doctoolversion
144+
page_content,
145+
fragment,
146+
doctool,
147+
doctoolversion,
148+
selector,
137149
)
138150

139-
def _find_main_node(self, html):
151+
def _find_main_node(self, html, selector):
152+
if selector:
153+
return html.css_first(selector)
154+
140155
main_node = html.css_first("[role=main]")
141156
if main_node:
142157
log.debug("Main node found. selector=[role=main]")
@@ -152,7 +167,14 @@ def _find_main_node(self, html):
152167
log.debug("Main node found. selector=h1")
153168
return first_header.parent
154169

155-
def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversion):
170+
def _parse_based_on_doctool(
171+
self,
172+
page_content,
173+
fragment,
174+
doctool,
175+
doctoolversion,
176+
selector,
177+
):
156178
# pylint: disable=unused-argument disable=too-many-nested-blocks
157179
if not page_content:
158180
return
@@ -167,7 +189,7 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
167189
node = HTMLParser(page_content).css_first(selector)
168190
else:
169191
html = HTMLParser(page_content)
170-
node = self._find_main_node(html)
192+
node = self._find_main_node(html, selector)
171193

172194
if not node:
173195
return
@@ -284,6 +306,7 @@ def get(self, request): # noqa
284306
url = request.GET.get("url")
285307
doctool = request.GET.get("doctool")
286308
doctoolversion = request.GET.get("doctoolversion")
309+
selector = request.GET.get("maincontent")
287310

288311
if not url:
289312
return Response(
@@ -338,6 +361,7 @@ def get(self, request): # noqa
338361
fragment,
339362
doctool,
340363
doctoolversion,
364+
selector,
341365
)
342366
except requests.exceptions.TooManyRedirects:
343367
log.exception("Too many redirects.", url=url)
@@ -365,12 +389,17 @@ def get(self, request): # noqa
365389
)
366390

367391
if not content_requested:
368-
log.warning("Identifier not found.", url=url, fragment=fragment)
392+
log.warning(
393+
"Identifier not found.",
394+
url=url,
395+
fragment=fragment,
396+
maincontent=selector,
397+
)
369398
return Response(
370399
{
371400
"error": (
372401
"Can't find content for section: "
373-
f"url={url} fragment={fragment}"
402+
f"url={url} fragment={fragment} maincontent={selector}"
374403
)
375404
},
376405
status=status.HTTP_404_NOT_FOUND,

0 commit comments

Comments
 (0)