Merged
28 changes: 23 additions & 5 deletions libs/community/langchain_community/document_loaders/sitemap.py
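This diff adds a max_depth option (default 10) to SitemapLoader and threads a depth counter through parse_sitemap, so recursion into nested <sitemap> index files stops once the limit is reached instead of following sitemap links without bound.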
@@ -1,6 +1,16 @@
 import itertools
 import re
-from typing import Any, Callable, Generator, Iterable, Iterator, List, Optional, Tuple
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+)
 from urllib.parse import urlparse
 
 from langchain_core.documents import Document
@@ -75,6 +85,7 @@ def __init__(
         is_local: bool = False,
         continue_on_failure: bool = False,
         restrict_to_same_domain: bool = True,
+        max_depth: int = 10,
         **kwargs: Any,
     ):
         """Initialize with webpage path and optional filter URLs.
@@ -105,6 +116,7 @@ def __init__(
             restrict_to_same_domain: whether to restrict loading to URLs to the same
                 domain as the sitemap. Attention: This is only applied if the sitemap
                 is not a local file!
+            max_depth: maximum depth to follow sitemap links. Default: 10
         """
 
         if blocksize is not None and blocksize < 1:
@@ -134,17 +146,23 @@ def __init__(
         self.blocknum = blocknum
         self.is_local = is_local
         self.continue_on_failure = continue_on_failure
+        self.max_depth = max_depth
 
-    def parse_sitemap(self, soup: Any) -> List[dict]:
+    def parse_sitemap(self, soup: Any, *, depth: int = 0) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts.
 
         Args:
             soup: BeautifulSoup object.
+            depth: current depth of the sitemap. Default: 0
 
         Returns:
             List of dicts.
         """
-        els = []
+        if depth >= self.max_depth:
+            return []
+
+        els: List[Dict] = []
+
         for url in soup.find_all("url"):
             loc = url.find("loc")
             if not loc:
@@ -177,9 +195,9 @@ def parse_sitemap(self, soup: Any) -> List[dict]:
             loc = sitemap.find("loc")
             if not loc:
                 continue
-            soup_child = self.scrape_all([loc.text], "xml")[0]
 
-            els.extend(self.parse_sitemap(soup_child))
+            soup_child = self.scrape_all([loc.text], "xml")[0]
+            els.extend(self.parse_sitemap(soup_child, depth=depth + 1))
         return els
 
     def lazy_load(self) -> Iterator[Document]:
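For illustration, a minimal usage sketch of the new option (the sitemap URL is a placeholder, not from this PR): with max_depth=2, parse_sitemap reads the root sitemap at depth 0 and any nested index files at depth 1, while anything reached at depth 2 hits the guard and returns an empty list.

from langchain_community.document_loaders.sitemap import SitemapLoader

# Hypothetical sitemap index URL; max_depth=2 keeps the root sitemap
# (depth 0) and one level of nested <sitemap> entries (depth 1), while
# anything reached at depth 2 hits the guard and contributes no documents.
loader = SitemapLoader(
    "https://example.com/sitemap_index.xml",
    max_depth=2,  # default is 10
)
docs = loader.load()

Because the guard returns [] rather than raising, hitting the limit truncates the result quietly; callers that need everything from a deeply nested sitemap should raise max_depth instead.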