diff --git a/libs/community/langchain_community/document_loaders/gitbook.py b/libs/community/langchain_community/document_loaders/gitbook.py
index 816d68c33f610..d74b4cd4763fb 100644
--- a/libs/community/langchain_community/document_loaders/gitbook.py
+++ b/libs/community/langchain_community/document_loaders/gitbook.py
@@ -21,6 +21,8 @@ def __init__(
content_selector: str = "main",
continue_on_failure: bool = False,
show_progress: bool = True,
+ *,
+ sitemap_url: Optional[str] = None,
):
"""Initialize with web page and whether to load all paths.
@@ -38,13 +40,20 @@ def __init__(
exception. Setting this to True makes the loader more robust, but also
may result in missing data. Default: False
show_progress: whether to show a progress bar while loading. Default: True
+ sitemap_url: Custom sitemap URL to use when load_all_paths is True.
+ Defaults to "{base_url}/sitemap.xml".
"""
self.base_url = base_url or web_page
if self.base_url.endswith("/"):
self.base_url = self.base_url[:-1]
+
if load_all_paths:
# set web_path to the sitemap if we want to crawl all paths
- web_page = f"{self.base_url}/sitemap.xml"
+ if sitemap_url:
+ web_page = sitemap_url
+ else:
+ web_page = f"{self.base_url}/sitemap.xml"
+
super().__init__(
web_paths=(web_page,),
continue_on_failure=continue_on_failure,
diff --git a/libs/community/tests/integration_tests/document_loaders/test_gitbook.py b/libs/community/tests/integration_tests/document_loaders/test_gitbook.py
index 151f21be7e439..ee1d2d769344c 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_gitbook.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_gitbook.py
@@ -54,3 +54,24 @@ def test_load_multiple_pages(self, web_page: str) -> None:
result = loader.load()
print(len(result)) # noqa: T201
assert len(result) > 10
+
+ @pytest.mark.parametrize(
+ "web_page, sitemap_url, expected_web_path",
+ [
+ (
+ "https://example.com/",
+ "https://example.com/custom-sitemap.xml",
+ "https://example.com/custom-sitemap.xml",
+ ),
+ ],
+ )
+ def test_init_with_custom_sitemap(
+ self,
+ web_page: str,
+ sitemap_url: str,
+ expected_web_path: str,
+ ) -> None:
+ """Test that the custom sitemap URL is correctly used when provided."""
+ loader = GitbookLoader(web_page, load_all_paths=True, sitemap_url=sitemap_url)
+ assert loader.web_path == expected_web_path
+ assert loader.load_all_paths
diff --git a/libs/community/tests/unit_tests/document_loaders/test_gitbook.py b/libs/community/tests/unit_tests/document_loaders/test_gitbook.py
new file mode 100644
index 0000000000000..f429d40037496
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/test_gitbook.py
@@ -0,0 +1,204 @@
+from typing import Any, Tuple
+from unittest.mock import MagicMock, patch
+
+import pytest
+from bs4 import BeautifulSoup
+
+from langchain_community.document_loaders.gitbook import GitbookLoader
+
+
+@pytest.fixture
+def mock_soups() -> Tuple[BeautifulSoup, BeautifulSoup]:
+ # Create mock soup with loc elements for sitemap testing
+    sitemap_content = """
+    <urlset>
+        <url><loc>https://example.com/page1</loc></url>
+        <url><loc>https://example.com/page2</loc></url>
+        <url><loc>https://example.com/page3</loc></url>
+    </urlset>
+    """
+ mock_sitemap_soup = BeautifulSoup(sitemap_content, "html.parser")
+
+ # Create mock soup for page content
+    page_content = """
+    <html>
+    <body>
+    <main>
+    <h1>Test Page</h1>
+    <p>This is test content.</p>
+    </main>
+    </body>
+    </html>
+    """
+ mock_page_soup = BeautifulSoup(page_content, "html.parser")
+ return mock_sitemap_soup, mock_page_soup
+
+
+@patch("langchain_community.document_loaders.web_base.requests.get")
+def test_init_with_default_sitemap(mock_get: MagicMock) -> None:
+ # Test that the loader uses the default sitemap URL when load_all_paths=True
+ loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)
+
+ # Check that the web_path was set to the default sitemap URL
+ assert loader.web_paths[0] == "https://example.com/sitemap.xml"
+
+
+@patch("langchain_community.document_loaders.web_base.requests.get")
+def test_init_with_custom_sitemap(mock_get: MagicMock) -> None:
+ # Test that the loader uses the provided sitemap URL when specified
+ custom_sitemap = "https://example.com/sitemap-pages.xml"
+ loader = GitbookLoader(
+ web_page="https://example.com",
+ load_all_paths=True,
+ sitemap_url=custom_sitemap,
+ )
+
+ # Check that the web_path was set to the custom sitemap URL
+ assert loader.web_paths[0] == custom_sitemap
+
+
+@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
+@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all")
+def test_lazy_load_with_custom_sitemap(
+ mock_scrape_all: MagicMock,
+ mock_scrape: MagicMock,
+ mock_soups: Tuple[BeautifulSoup, BeautifulSoup],
+) -> None:
+ # Setup the mocks
+ mock_sitemap_soup, mock_page_soup = mock_soups
+ mock_scrape.return_value = mock_sitemap_soup
+ mock_scrape_all.return_value = [
+ mock_page_soup,
+ mock_page_soup,
+ mock_page_soup,
+ ]
+
+ # Create loader with custom sitemap URL
+ loader = GitbookLoader(
+ web_page="https://example.com",
+ load_all_paths=True,
+ sitemap_url="https://example.com/sitemap-pages.xml",
+ )
+
+ # Get the documents
+ docs = list(loader.lazy_load())
+
+ # Check that we got docs for each path in the sitemap
+ assert len(docs) == 3
+ for doc in docs:
+ assert doc.metadata["title"] == "Test Page"
+ assert "This is test content." in doc.page_content
+
+
+@patch("langchain_community.document_loaders.web_base.requests.get")
+def test_with_single_page(mock_get: MagicMock) -> None:
+ # Test loading a single page (load_all_paths=False)
+ loader = GitbookLoader(web_page="https://example.com/page", load_all_paths=False)
+
+ # Check that sitemap URL logic was not applied
+ assert loader.web_paths[0] == "https://example.com/page"
+
+
+@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
+def test_get_paths_extraction(
+ mock_scrape: MagicMock, mock_soups: Tuple[BeautifulSoup, BeautifulSoup]
+) -> None:
+ # Test that _get_paths correctly extracts paths from sitemap
+ mock_sitemap_soup, _ = mock_soups
+ mock_scrape.return_value = mock_sitemap_soup
+
+ loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)
+
+ soup_info = loader.scrape()
+ paths = loader._get_paths(soup_info)
+
+ # Check that paths were extracted correctly
+ assert len(paths) == 3
+ assert paths == ["/page1", "/page2", "/page3"]
+
+
+@patch("requests.get")
+def test_integration_with_different_sitemaps(mock_get: MagicMock) -> None:
+ # This test simulates the reported issue with different sitemap formats
+
+ # Mock response for default sitemap (empty content)
+ empty_resp = MagicMock()
+ empty_resp.text = ""
+ empty_resp.status_code = 200
+
+ # Mock response for custom sitemap (with content)
+ custom_resp = MagicMock()
+    custom_resp.text = """
+    <urlset>
+        <url><loc>https://docs.gitbook.com/page1</loc></url>
+        <url><loc>https://docs.gitbook.com/page2</loc></url>
+    </urlset>
+    """
+ custom_resp.status_code = 200
+
+ # Mock response for the actual pages
+ page_resp = MagicMock()
+    page_resp.text = """
+    <html><body><main><h1>Page</h1><p>Content</p></main></body></html>
+    """
+ page_resp.status_code = 200
+
+ # Define side effect to return different responses based on URL
+ def side_effect(url: str, *args: Any, **kwargs: Any) -> MagicMock:
+ if url == "https://docs.gitbook.com/sitemap.xml":
+ return empty_resp
+ elif url == "https://docs.gitbook.com/sitemap-pages.xml":
+ return custom_resp
+ else:
+ return page_resp
+
+ mock_get.side_effect = side_effect
+
+ # Test with default sitemap (should result in no docs)
+ with patch(
+ "langchain_community.document_loaders.web_base.requests.get",
+ side_effect=side_effect,
+ ):
+ with patch(
+ "langchain_community.document_loaders.gitbook.GitbookLoader.scrape"
+ ) as mock_scrape:
+ with patch(
+ "langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all"
+ ) as mock_scrape_all:
+ mock_scrape.return_value = BeautifulSoup(
+ "", "html.parser"
+ )
+ mock_scrape_all.return_value = []
+
+ loader1 = GitbookLoader(
+ web_page="https://docs.gitbook.com/", load_all_paths=True
+ )
+ docs1 = list(loader1.lazy_load())
+ assert len(docs1) == 0
+
+ # Test with custom sitemap (should result in docs)
+ with patch(
+ "langchain_community.document_loaders.web_base.requests.get",
+ side_effect=side_effect,
+ ):
+ with patch(
+ "langchain_community.document_loaders.gitbook.GitbookLoader.scrape"
+ ) as mock_scrape:
+ with patch(
+ "langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all"
+ ) as mock_scrape_all:
+ mock_scrape.return_value = BeautifulSoup(
+ custom_resp.text, "html.parser"
+ )
+ mock_scrape_all.return_value = [
+ BeautifulSoup(page_resp.text, "html.parser"),
+ BeautifulSoup(page_resp.text, "html.parser"),
+ ]
+
+ loader2 = GitbookLoader(
+ web_page="https://docs.gitbook.com/",
+ load_all_paths=True,
+ sitemap_url="https://docs.gitbook.com/sitemap-pages.xml",
+ )
+ docs2 = list(loader2.lazy_load())
+ assert len(docs2) == 2