diff --git a/libs/community/langchain_community/document_loaders/gitbook.py b/libs/community/langchain_community/document_loaders/gitbook.py
index 816d68c33f610..d74b4cd4763fb 100644
--- a/libs/community/langchain_community/document_loaders/gitbook.py
+++ b/libs/community/langchain_community/document_loaders/gitbook.py
@@ -21,6 +21,8 @@ def __init__(
         content_selector: str = "main",
         continue_on_failure: bool = False,
         show_progress: bool = True,
+        *,
+        sitemap_url: Optional[str] = None,
     ):
         """Initialize with web page and whether to load all paths.
 
@@ -38,13 +40,20 @@ def __init__(
                 exception. Setting this to True makes the loader more robust, but
                 also may result in missing data. Default: False
             show_progress: whether to show a progress bar while loading. Default: True
+            sitemap_url: Custom sitemap URL to use when load_all_paths is True.
+                Defaults to "{base_url}/sitemap.xml".
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
             self.base_url = self.base_url[:-1]
+
         if load_all_paths:
             # set web_path to the sitemap if we want to crawl all paths
-            web_page = f"{self.base_url}/sitemap.xml"
+            if sitemap_url:
+                web_page = sitemap_url
+            else:
+                web_page = f"{self.base_url}/sitemap.xml"
+
         super().__init__(
             web_paths=(web_page,),
             continue_on_failure=continue_on_failure,
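For context on how the new keyword is meant to be used, here is a minimal usage sketch (not code from this change). The `docs.example.com` URLs and the `sitemap-pages.xml` filename are placeholders; the only API assumed is the `sitemap_url` keyword added above.

```python
from langchain_community.document_loaders.gitbook import GitbookLoader

# Default behavior: with load_all_paths=True the loader fetches
# {base_url}/sitemap.xml and crawls every path it lists.
loader = GitbookLoader("https://docs.example.com", load_all_paths=True)

# With the new keyword, a non-default sitemap can be supplied instead,
# e.g. for GitBook sites that only expose a paginated sitemap.
loader = GitbookLoader(
    "https://docs.example.com",
    load_all_paths=True,
    sitemap_url="https://docs.example.com/sitemap-pages.xml",
)
docs = loader.load()
```

Making `sitemap_url` keyword-only (via the bare `*`) keeps existing positional callers working unchanged.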
diff --git a/libs/community/tests/integration_tests/document_loaders/test_gitbook.py b/libs/community/tests/integration_tests/document_loaders/test_gitbook.py
index 151f21be7e439..ee1d2d769344c 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_gitbook.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_gitbook.py
@@ -54,3 +54,24 @@ def test_load_multiple_pages(self, web_page: str) -> None:
         result = loader.load()
         print(len(result))  # noqa: T201
         assert len(result) > 10
+
+    @pytest.mark.parametrize(
+        "web_page, sitemap_url, expected_web_path",
+        [
+            (
+                "https://example.com/",
+                "https://example.com/custom-sitemap.xml",
+                "https://example.com/custom-sitemap.xml",
+            ),
+        ],
+    )
+    def test_init_with_custom_sitemap(
+        self,
+        web_page: str,
+        sitemap_url: str,
+        expected_web_path: str,
+    ) -> None:
+        """Test that the custom sitemap URL is correctly used when provided."""
+        loader = GitbookLoader(web_page, load_all_paths=True, sitemap_url=sitemap_url)
+        assert loader.web_path == expected_web_path
+        assert loader.load_all_paths
diff --git a/libs/community/tests/unit_tests/document_loaders/test_gitbook.py b/libs/community/tests/unit_tests/document_loaders/test_gitbook.py
new file mode 100644
index 0000000000000..f429d40037496
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/test_gitbook.py
@@ -0,0 +1,204 @@
+from typing import Any, Tuple
+from unittest.mock import MagicMock, patch
+
+import pytest
+from bs4 import BeautifulSoup
+
+from langchain_community.document_loaders.gitbook import GitbookLoader
+
+
+@pytest.fixture
+def mock_soups() -> Tuple[BeautifulSoup, BeautifulSoup]:
+    # Create mock soup with loc elements for sitemap testing
+    sitemap_content = """
+    <urlset>
+    <loc>https://example.com/page1</loc>
+    <loc>https://example.com/page2</loc>
+    <loc>https://example.com/page3</loc>
+    </urlset>
+    """
+    mock_sitemap_soup = BeautifulSoup(sitemap_content, "html.parser")
+
+    # Create mock soup for page content
+    page_content = """
+    <html>
+    <body>
+    <main>
+    <h1>Test Page</h1>
+    <p>This is test content.</p>
+    </main>
+    </body>
+    </html>
+    """
+    mock_page_soup = BeautifulSoup(page_content, "html.parser")
+    return mock_sitemap_soup, mock_page_soup
+
+
+@patch("langchain_community.document_loaders.web_base.requests.get")
+def test_init_with_default_sitemap(mock_get: MagicMock) -> None:
+    # Test that the loader uses the default sitemap URL when load_all_paths=True
+    loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)
+
+    # Check that the web_path was set to the default sitemap URL
+    assert loader.web_paths[0] == "https://example.com/sitemap.xml"
+
+
+@patch("langchain_community.document_loaders.web_base.requests.get")
+def test_init_with_custom_sitemap(mock_get: MagicMock) -> None:
+    # Test that the loader uses the provided sitemap URL when specified
+    custom_sitemap = "https://example.com/sitemap-pages.xml"
+    loader = GitbookLoader(
+        web_page="https://example.com",
+        load_all_paths=True,
+        sitemap_url=custom_sitemap,
+    )
+
+    # Check that the web_path was set to the custom sitemap URL
+    assert loader.web_paths[0] == custom_sitemap
+
+
+@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
+@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all")
+def test_lazy_load_with_custom_sitemap(
+    mock_scrape_all: MagicMock,
+    mock_scrape: MagicMock,
+    mock_soups: Tuple[BeautifulSoup, BeautifulSoup],
+) -> None:
+    # Setup the mocks
+    mock_sitemap_soup, mock_page_soup = mock_soups
+    mock_scrape.return_value = mock_sitemap_soup
+    mock_scrape_all.return_value = [
+        mock_page_soup,
+        mock_page_soup,
+        mock_page_soup,
+    ]
+
+    # Create loader with custom sitemap URL
+    loader = GitbookLoader(
+        web_page="https://example.com",
+        load_all_paths=True,
+        sitemap_url="https://example.com/sitemap-pages.xml",
+    )
+
+    # Get the documents
+    docs = list(loader.lazy_load())
+
+    # Check that we got docs for each path in the sitemap
+    assert len(docs) == 3
+    for doc in docs:
+        assert doc.metadata["title"] == "Test Page"
+        assert "This is test content." in doc.page_content
+
+
+@patch("langchain_community.document_loaders.web_base.requests.get")
+def test_with_single_page(mock_get: MagicMock) -> None:
+    # Test loading a single page (load_all_paths=False)
+    loader = GitbookLoader(web_page="https://example.com/page", load_all_paths=False)
+
+    # Check that sitemap URL logic was not applied
+    assert loader.web_paths[0] == "https://example.com/page"
+
+
+@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
+def test_get_paths_extraction(
+    mock_scrape: MagicMock, mock_soups: Tuple[BeautifulSoup, BeautifulSoup]
+) -> None:
+    # Test that _get_paths correctly extracts paths from sitemap
+    mock_sitemap_soup, _ = mock_soups
+    mock_scrape.return_value = mock_sitemap_soup
+
+    loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)
+
+    soup_info = loader.scrape()
+    paths = loader._get_paths(soup_info)
+
+    # Check that paths were extracted correctly
+    assert len(paths) == 3
+    assert paths == ["/page1", "/page2", "/page3"]
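The two assertions above lean on `GitbookLoader._get_paths` reducing each sitemap `<loc>` entry to its URL path. The sketch below approximates that contract for readers of the test; it is not the loader's verbatim implementation, and `get_paths` is a stand-in name.

```python
from typing import List
from urllib.parse import urlparse

from bs4 import BeautifulSoup


def get_paths(soup: BeautifulSoup) -> List[str]:
    # Keep only the path component of each <loc> URL, so
    # "https://example.com/page1" becomes "/page1".
    return [urlparse(loc.text).path for loc in soup.find_all("loc")]


soup = BeautifulSoup(
    "<urlset><loc>https://example.com/page1</loc></urlset>", "html.parser"
)
assert get_paths(soup) == ["/page1"]
```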

Page

Content

+ """ + page_resp.status_code = 200 + + # Define side effect to return different responses based on URL + def side_effect(url: str, *args: Any, **kwargs: Any) -> MagicMock: + if url == "https://docs.gitbook.com/sitemap.xml": + return empty_resp + elif url == "https://docs.gitbook.com/sitemap-pages.xml": + return custom_resp + else: + return page_resp + + mock_get.side_effect = side_effect + + # Test with default sitemap (should result in no docs) + with patch( + "langchain_community.document_loaders.web_base.requests.get", + side_effect=side_effect, + ): + with patch( + "langchain_community.document_loaders.gitbook.GitbookLoader.scrape" + ) as mock_scrape: + with patch( + "langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all" + ) as mock_scrape_all: + mock_scrape.return_value = BeautifulSoup( + "", "html.parser" + ) + mock_scrape_all.return_value = [] + + loader1 = GitbookLoader( + web_page="https://docs.gitbook.com/", load_all_paths=True + ) + docs1 = list(loader1.lazy_load()) + assert len(docs1) == 0 + + # Test with custom sitemap (should result in docs) + with patch( + "langchain_community.document_loaders.web_base.requests.get", + side_effect=side_effect, + ): + with patch( + "langchain_community.document_loaders.gitbook.GitbookLoader.scrape" + ) as mock_scrape: + with patch( + "langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all" + ) as mock_scrape_all: + mock_scrape.return_value = BeautifulSoup( + custom_resp.text, "html.parser" + ) + mock_scrape_all.return_value = [ + BeautifulSoup(page_resp.text, "html.parser"), + BeautifulSoup(page_resp.text, "html.parser"), + ] + + loader2 = GitbookLoader( + web_page="https://docs.gitbook.com/", + load_all_paths=True, + sitemap_url="https://docs.gitbook.com/sitemap-pages.xml", + ) + docs2 = list(loader2.lazy_load()) + assert len(docs2) == 2