
community: Add custom sitemap URL parameter to GitbookLoader #30549


Merged: 6 commits on Apr 1, 2025
Changes from 2 commits
10 changes: 9 additions & 1 deletion libs/community/langchain_community/document_loaders/gitbook.py
@@ -21,6 +21,7 @@ def __init__(
         content_selector: str = "main",
         continue_on_failure: bool = False,
         show_progress: bool = True,
+        sitemap_url: Optional[str] = None,
     ):
         """Initialize with web page and whether to load all paths.

@@ -38,13 +39,20 @@ def __init__(
                 exception. Setting this to True makes the loader more robust, but also
                 may result in missing data. Default: False
             show_progress: whether to show a progress bar while loading. Default: True
+            sitemap_url: Custom sitemap URL to use when load_all_paths is True.
+                Defaults to "{base_url}/sitemap.xml".
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
             self.base_url = self.base_url[:-1]

         if load_all_paths:
             # set web_path to the sitemap if we want to crawl all paths
-            web_page = f"{self.base_url}/sitemap.xml"
+            if sitemap_url:
+                web_page = sitemap_url
+            else:
+                web_page = f"{self.base_url}/sitemap.xml"

         super().__init__(
             web_paths=(web_page,),
             continue_on_failure=continue_on_failure,
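For context, here is a minimal usage sketch of the new parameter; the site and URLs below are placeholders, not taken from the PR:

from langchain_community.document_loaders import GitbookLoader

# Default behaviour (unchanged): the sitemap is derived from the base URL,
# i.e. "https://docs.example.com/sitemap.xml" in this hypothetical case.
loader = GitbookLoader("https://docs.example.com", load_all_paths=True)

# With this PR: point the loader at a different sitemap, e.g. a site whose
# pages are listed under "sitemap-pages.xml" (as in the tests below).
loader = GitbookLoader(
    "https://docs.example.com",
    load_all_paths=True,
    sitemap_url="https://docs.example.com/sitemap-pages.xml",
)
docs = loader.load()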
@@ -54,3 +54,25 @@ def test_load_multiple_pages(self, web_page: str) -> None:
         result = loader.load()
         print(len(result))  # noqa: T201
         assert len(result) > 10
+
+    @pytest.mark.parametrize(
+        "web_page, sitemap_url, expected_web_path",
+        [
+            (
+                "https://example.com/",
+                "https://example.com/custom-sitemap.xml",
+                "https://example.com/custom-sitemap.xml",
+            ),
+        ],
+    )
+    def test_init_with_custom_sitemap(
+        self,
+        web_page: str,
+        sitemap_url: str,
+        expected_web_path: str,
+    ) -> None:
+        """Test that the custom sitemap URL is correctly used when provided."""
+        loader = GitbookLoader(web_page, load_all_paths=True, sitemap_url=sitemap_url)
+        print(loader.__dict__)  # noqa: T201
+        assert loader.web_path == expected_web_path
+        assert loader.load_all_paths
193 changes: 193 additions & 0 deletions libs/community/tests/unit_tests/document_loaders/test_gitbook.py
@@ -0,0 +1,193 @@
import unittest
from typing import Any
from unittest.mock import MagicMock, patch

from bs4 import BeautifulSoup

from langchain_community.document_loaders.gitbook import GitbookLoader


class TestGitbookLoader(unittest.TestCase):
    def setUp(self) -> None:
        # Create mock soup with loc elements for sitemap testing
        sitemap_content = """
        <urlset>
            <url><loc>https://example.com/page1</loc></url>
            <url><loc>https://example.com/page2</loc></url>
            <url><loc>https://example.com/page3</loc></url>
        </urlset>
        """
        self.mock_sitemap_soup = BeautifulSoup(sitemap_content, "html.parser")

        # Create mock soup for page content
        page_content = """
        <html>
            <body>
                <main>
                    <h1>Test Page</h1>
                    <p>This is test content.</p>
                </main>
            </body>
        </html>
        """
        self.mock_page_soup = BeautifulSoup(page_content, "html.parser")

@patch("langchain_community.document_loaders.web_base.requests.get")
def test_init_with_default_sitemap(self, mock_get: MagicMock) -> None:
# Test that the loader uses the default sitemap URL when load_all_paths=True
loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)

# Check that the web_path was set to the default sitemap URL
self.assertEqual(loader.web_paths[0], "https://example.com/sitemap.xml")

@patch("langchain_community.document_loaders.web_base.requests.get")
def test_init_with_custom_sitemap(self, mock_get: MagicMock) -> None:
# Test that the loader uses the provided sitemap URL when specified
custom_sitemap = "https://example.com/sitemap-pages.xml"
loader = GitbookLoader(
web_page="https://example.com",
load_all_paths=True,
sitemap_url=custom_sitemap,
)

# Check that the web_path was set to the custom sitemap URL
self.assertEqual(loader.web_paths[0], custom_sitemap)

@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all")
def test_lazy_load_with_custom_sitemap(
self, mock_scrape_all: MagicMock, mock_scrape: MagicMock
) -> None:
# Setup the mocks
mock_scrape.return_value = self.mock_sitemap_soup
mock_scrape_all.return_value = [
self.mock_page_soup,
self.mock_page_soup,
self.mock_page_soup,
]

# Create loader with custom sitemap URL
loader = GitbookLoader(
web_page="https://example.com",
load_all_paths=True,
sitemap_url="https://example.com/sitemap-pages.xml",
)

# Get the documents
docs = list(loader.lazy_load())

# Check that we got docs for each path in the sitemap
self.assertEqual(len(docs), 3)
for doc in docs:
self.assertEqual(doc.metadata["title"], "Test Page")
self.assertTrue("This is test content." in doc.page_content)

@patch("langchain_community.document_loaders.web_base.requests.get")
def test_with_single_page(self, mock_get: MagicMock) -> None:
# Test loading a single page (load_all_paths=False)
loader = GitbookLoader(
web_page="https://example.com/page", load_all_paths=False
)

# Check that sitemap URL logic was not applied
self.assertEqual(loader.web_paths[0], "https://example.com/page")

@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
def test_get_paths_extraction(self, mock_scrape: MagicMock) -> None:
# Test that _get_paths correctly extracts paths from sitemap
mock_scrape.return_value = self.mock_sitemap_soup

loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)

soup_info = loader.scrape()
paths = loader._get_paths(soup_info)

# Check that paths were extracted correctly
self.assertEqual(len(paths), 3)
self.assertEqual(paths, ["/page1", "/page2", "/page3"])

@patch("requests.get")
def test_integration_with_different_sitemaps(self, mock_get: MagicMock) -> None:
# This test simulates the reported issue with different sitemap formats

# Mock response for default sitemap (empty content)
empty_resp = MagicMock()
empty_resp.text = "<urlset></urlset>"
empty_resp.status_code = 200

# Mock response for custom sitemap (with content)
custom_resp = MagicMock()
custom_resp.text = """
<urlset>
<url><loc>https://docs.gitbook.com/page1</loc></url>
<url><loc>https://docs.gitbook.com/page2</loc></url>
</urlset>
"""
custom_resp.status_code = 200

# Mock response for the actual pages
page_resp = MagicMock()
page_resp.text = """
<html><body><main><h1>Page</h1><p>Content</p></main></body></html>
"""
page_resp.status_code = 200

# Define side effect to return different responses based on URL
def side_effect(url: str, *args: Any, **kwargs: Any) -> MagicMock:
if url == "https://docs.gitbook.com/sitemap.xml":
return empty_resp
elif url == "https://docs.gitbook.com/sitemap-pages.xml":
return custom_resp
else:
return page_resp

mock_get.side_effect = side_effect

        # Test with default sitemap (should result in no docs)
        with patch(
            "langchain_community.document_loaders.web_base.requests.get",
            side_effect=side_effect,
        ):
            with patch(
                "langchain_community.document_loaders.gitbook.GitbookLoader.scrape"
            ) as mock_scrape:
                with patch(
                    "langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all"
                ) as mock_scrape_all:
                    mock_scrape.return_value = BeautifulSoup(
                        "<urlset></urlset>", "html.parser"
                    )
                    mock_scrape_all.return_value = []

                    loader1 = GitbookLoader(
                        web_page="https://docs.gitbook.com/", load_all_paths=True
                    )
                    docs1 = list(loader1.lazy_load())
                    self.assertEqual(len(docs1), 0)

        # Test with custom sitemap (should result in docs)
        with patch(
            "langchain_community.document_loaders.web_base.requests.get",
            side_effect=side_effect,
        ):
            with patch(
                "langchain_community.document_loaders.gitbook.GitbookLoader.scrape"
            ) as mock_scrape:
                with patch(
                    "langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all"
                ) as mock_scrape_all:
                    mock_scrape.return_value = BeautifulSoup(
                        custom_resp.text, "html.parser"
                    )
                    mock_scrape_all.return_value = [
                        BeautifulSoup(page_resp.text, "html.parser"),
                        BeautifulSoup(page_resp.text, "html.parser"),
                    ]

                    loader2 = GitbookLoader(
                        web_page="https://docs.gitbook.com/",
                        load_all_paths=True,
                        sitemap_url="https://docs.gitbook.com/sitemap-pages.xml",
                    )
                    docs2 = list(loader2.lazy_load())
                    self.assertEqual(len(docs2), 2)
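Outside the test suite, the same no-network pattern used above can be handy for exercising the loader by hand. A minimal sketch, with all URLs and fixture values illustrative rather than taken from the PR:

from unittest.mock import patch

from bs4 import BeautifulSoup

from langchain_community.document_loaders import GitbookLoader

# Fake sitemap and page soups stand in for real HTTP responses.
fake_sitemap = BeautifulSoup(
    "<urlset><url><loc>https://example.com/page1</loc></url></urlset>",
    "html.parser",
)
fake_page = BeautifulSoup(
    "<html><body><main><h1>Title</h1><p>Body</p></main></body></html>",
    "html.parser",
)

with patch.object(GitbookLoader, "scrape", return_value=fake_sitemap), patch.object(
    GitbookLoader, "scrape_all", return_value=[fake_page]
):
    loader = GitbookLoader(
        "https://example.com",
        load_all_paths=True,
        sitemap_url="https://example.com/custom-sitemap.xml",
    )
    docs = list(loader.lazy_load())
    print(len(docs), docs[0].metadata["title"])  # prints: 1 Title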