Skip to content

Commit 64df60e

Browse files
andrasfeandrasfeeyurtsev
authored
community[minor]: Add custom sitemap URL parameter to GitbookLoader (#30549)
## Description This PR adds a new `sitemap_url` parameter to the `GitbookLoader` class that allows users to specify a custom sitemap URL when loading content from a GitBook site. This is particularly useful for GitBook sites that use non-standard sitemap file names like `sitemap-pages.xml` instead of the default `sitemap.xml`. The standard `GitbookLoader` assumes that the sitemap is located at `/sitemap.xml`, but some GitBook instances (including GitBook's own documentation) use different paths for their sitemaps. This parameter makes the loader more flexible and helps users extract content from a wider range of GitBook sites. ## Issue Fixes bug [30473](#30473) where the `GitbookLoader` would fail to find pages on GitBook sites that use custom sitemap URLs. ## Dependencies No new dependencies required. *I've added*: * Unit tests to verify the parameter works correctly * Integration tests to confirm the parameter is properly used with real GitBook sites * Updated docstrings with parameter documentation The changes are fully backward compatible, as the parameter is optional with a sensible default. --------- Co-authored-by: andrasfe <[email protected]> Co-authored-by: Eugene Yurtsev <[email protected]>
1 parent fdda1aa commit 64df60e

File tree

3 files changed

+235
-1
lines changed

3 files changed

+235
-1
lines changed

libs/community/langchain_community/document_loaders/gitbook.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ def __init__(
2121
content_selector: str = "main",
2222
continue_on_failure: bool = False,
2323
show_progress: bool = True,
24+
*,
25+
sitemap_url: Optional[str] = None,
2426
):
2527
"""Initialize with web page and whether to load all paths.
2628
@@ -38,13 +40,20 @@ def __init__(
3840
exception. Setting this to True makes the loader more robust, but also
3941
may result in missing data. Default: False
4042
show_progress: whether to show a progress bar while loading. Default: True
43+
sitemap_url: Custom sitemap URL to use when load_all_paths is True.
44+
Defaults to "{base_url}/sitemap.xml".
4145
"""
4246
self.base_url = base_url or web_page
4347
if self.base_url.endswith("/"):
4448
self.base_url = self.base_url[:-1]
49+
4550
if load_all_paths:
4651
# set web_path to the sitemap if we want to crawl all paths
47-
web_page = f"{self.base_url}/sitemap.xml"
52+
if sitemap_url:
53+
web_page = sitemap_url
54+
else:
55+
web_page = f"{self.base_url}/sitemap.xml"
56+
4857
super().__init__(
4958
web_paths=(web_page,),
5059
continue_on_failure=continue_on_failure,

libs/community/tests/integration_tests/document_loaders/test_gitbook.py

+21
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,24 @@ def test_load_multiple_pages(self, web_page: str) -> None:
5454
result = loader.load()
5555
print(len(result)) # noqa: T201
5656
assert len(result) > 10
57+
58+
@pytest.mark.parametrize(
59+
"web_page, sitemap_url, expected_web_path",
60+
[
61+
(
62+
"https://example.com/",
63+
"https://example.com/custom-sitemap.xml",
64+
"https://example.com/custom-sitemap.xml",
65+
),
66+
],
67+
)
68+
def test_init_with_custom_sitemap(
69+
self,
70+
web_page: str,
71+
sitemap_url: str,
72+
expected_web_path: str,
73+
) -> None:
74+
"""Test that the custom sitemap URL is correctly used when provided."""
75+
loader = GitbookLoader(web_page, load_all_paths=True, sitemap_url=sitemap_url)
76+
assert loader.web_path == expected_web_path
77+
assert loader.load_all_paths
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
from typing import Any, Tuple
2+
from unittest.mock import MagicMock, patch
3+
4+
import pytest
5+
from bs4 import BeautifulSoup
6+
7+
from langchain_community.document_loaders.gitbook import GitbookLoader
8+
9+
10+
@pytest.fixture
11+
def mock_soups() -> Tuple[BeautifulSoup, BeautifulSoup]:
12+
# Create mock soup with loc elements for sitemap testing
13+
sitemap_content = """
14+
<urlset>
15+
<url><loc>https://example.com/page1</loc></url>
16+
<url><loc>https://example.com/page2</loc></url>
17+
<url><loc>https://example.com/page3</loc></url>
18+
</urlset>
19+
"""
20+
mock_sitemap_soup = BeautifulSoup(sitemap_content, "html.parser")
21+
22+
# Create mock soup for page content
23+
page_content = """
24+
<html>
25+
<body>
26+
<main>
27+
<h1>Test Page</h1>
28+
<p>This is test content.</p>
29+
</main>
30+
</body>
31+
</html>
32+
"""
33+
mock_page_soup = BeautifulSoup(page_content, "html.parser")
34+
return mock_sitemap_soup, mock_page_soup
35+
36+
37+
@patch("langchain_community.document_loaders.web_base.requests.get")
38+
def test_init_with_default_sitemap(mock_get: MagicMock) -> None:
39+
# Test that the loader uses the default sitemap URL when load_all_paths=True
40+
loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)
41+
42+
# Check that the web_path was set to the default sitemap URL
43+
assert loader.web_paths[0] == "https://example.com/sitemap.xml"
44+
45+
46+
@patch("langchain_community.document_loaders.web_base.requests.get")
47+
def test_init_with_custom_sitemap(mock_get: MagicMock) -> None:
48+
# Test that the loader uses the provided sitemap URL when specified
49+
custom_sitemap = "https://example.com/sitemap-pages.xml"
50+
loader = GitbookLoader(
51+
web_page="https://example.com",
52+
load_all_paths=True,
53+
sitemap_url=custom_sitemap,
54+
)
55+
56+
# Check that the web_path was set to the custom sitemap URL
57+
assert loader.web_paths[0] == custom_sitemap
58+
59+
60+
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
61+
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all")
62+
def test_lazy_load_with_custom_sitemap(
63+
mock_scrape_all: MagicMock,
64+
mock_scrape: MagicMock,
65+
mock_soups: Tuple[BeautifulSoup, BeautifulSoup],
66+
) -> None:
67+
# Setup the mocks
68+
mock_sitemap_soup, mock_page_soup = mock_soups
69+
mock_scrape.return_value = mock_sitemap_soup
70+
mock_scrape_all.return_value = [
71+
mock_page_soup,
72+
mock_page_soup,
73+
mock_page_soup,
74+
]
75+
76+
# Create loader with custom sitemap URL
77+
loader = GitbookLoader(
78+
web_page="https://example.com",
79+
load_all_paths=True,
80+
sitemap_url="https://example.com/sitemap-pages.xml",
81+
)
82+
83+
# Get the documents
84+
docs = list(loader.lazy_load())
85+
86+
# Check that we got docs for each path in the sitemap
87+
assert len(docs) == 3
88+
for doc in docs:
89+
assert doc.metadata["title"] == "Test Page"
90+
assert "This is test content." in doc.page_content
91+
92+
93+
@patch("langchain_community.document_loaders.web_base.requests.get")
94+
def test_with_single_page(mock_get: MagicMock) -> None:
95+
# Test loading a single page (load_all_paths=False)
96+
loader = GitbookLoader(web_page="https://example.com/page", load_all_paths=False)
97+
98+
# Check that sitemap URL logic was not applied
99+
assert loader.web_paths[0] == "https://example.com/page"
100+
101+
102+
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
103+
def test_get_paths_extraction(
104+
mock_scrape: MagicMock, mock_soups: Tuple[BeautifulSoup, BeautifulSoup]
105+
) -> None:
106+
# Test that _get_paths correctly extracts paths from sitemap
107+
mock_sitemap_soup, _ = mock_soups
108+
mock_scrape.return_value = mock_sitemap_soup
109+
110+
loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)
111+
112+
soup_info = loader.scrape()
113+
paths = loader._get_paths(soup_info)
114+
115+
# Check that paths were extracted correctly
116+
assert len(paths) == 3
117+
assert paths == ["/page1", "/page2", "/page3"]
118+
119+
120+
@patch("requests.get")
121+
def test_integration_with_different_sitemaps(mock_get: MagicMock) -> None:
122+
# This test simulates the reported issue with different sitemap formats
123+
124+
# Mock response for default sitemap (empty content)
125+
empty_resp = MagicMock()
126+
empty_resp.text = "<urlset></urlset>"
127+
empty_resp.status_code = 200
128+
129+
# Mock response for custom sitemap (with content)
130+
custom_resp = MagicMock()
131+
custom_resp.text = """
132+
<urlset>
133+
<url><loc>https://docs.gitbook.com/page1</loc></url>
134+
<url><loc>https://docs.gitbook.com/page2</loc></url>
135+
</urlset>
136+
"""
137+
custom_resp.status_code = 200
138+
139+
# Mock response for the actual pages
140+
page_resp = MagicMock()
141+
page_resp.text = """
142+
<html><body><main><h1>Page</h1><p>Content</p></main></body></html>
143+
"""
144+
page_resp.status_code = 200
145+
146+
# Define side effect to return different responses based on URL
147+
def side_effect(url: str, *args: Any, **kwargs: Any) -> MagicMock:
148+
if url == "https://docs.gitbook.com/sitemap.xml":
149+
return empty_resp
150+
elif url == "https://docs.gitbook.com/sitemap-pages.xml":
151+
return custom_resp
152+
else:
153+
return page_resp
154+
155+
mock_get.side_effect = side_effect
156+
157+
# Test with default sitemap (should result in no docs)
158+
with patch(
159+
"langchain_community.document_loaders.web_base.requests.get",
160+
side_effect=side_effect,
161+
):
162+
with patch(
163+
"langchain_community.document_loaders.gitbook.GitbookLoader.scrape"
164+
) as mock_scrape:
165+
with patch(
166+
"langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all"
167+
) as mock_scrape_all:
168+
mock_scrape.return_value = BeautifulSoup(
169+
"<urlset></urlset>", "html.parser"
170+
)
171+
mock_scrape_all.return_value = []
172+
173+
loader1 = GitbookLoader(
174+
web_page="https://docs.gitbook.com/", load_all_paths=True
175+
)
176+
docs1 = list(loader1.lazy_load())
177+
assert len(docs1) == 0
178+
179+
# Test with custom sitemap (should result in docs)
180+
with patch(
181+
"langchain_community.document_loaders.web_base.requests.get",
182+
side_effect=side_effect,
183+
):
184+
with patch(
185+
"langchain_community.document_loaders.gitbook.GitbookLoader.scrape"
186+
) as mock_scrape:
187+
with patch(
188+
"langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all"
189+
) as mock_scrape_all:
190+
mock_scrape.return_value = BeautifulSoup(
191+
custom_resp.text, "html.parser"
192+
)
193+
mock_scrape_all.return_value = [
194+
BeautifulSoup(page_resp.text, "html.parser"),
195+
BeautifulSoup(page_resp.text, "html.parser"),
196+
]
197+
198+
loader2 = GitbookLoader(
199+
web_page="https://docs.gitbook.com/",
200+
load_all_paths=True,
201+
sitemap_url="https://docs.gitbook.com/sitemap-pages.xml",
202+
)
203+
docs2 = list(loader2.lazy_load())
204+
assert len(docs2) == 2

0 commit comments

Comments
 (0)