Improved crawler support for dynamically loaded pages #2710

Merged
15 changes: 12 additions & 3 deletions docs/_src/api/api/crawler.md
@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### Crawler.\_\_init\_\_

```python
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True)
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None)
```

Init object with basic params for crawling (can be overwritten later).
@@ -48,13 +48,16 @@ not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.
In this case the id will be generated by using the content and the defined metadata.
- `extract_hidden_text`: Whether to extract the hidden text contained in the page.
E.g. the text can be inside a span with style="display: none"
- `loading_wait_time`: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down the crawler.
E.g. 2: the crawler waits 2 seconds before scraping a page.
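
For example, a minimal sketch of setting the wait at construction time (the output directory and URL below are illustrative, not part of this PR):

```python
from haystack.nodes.connector import Crawler

# Wait 2 seconds after each page load so client-side JavaScript can finish
# rendering before the body text is extracted.
crawler = Crawler(
    output_dir="crawled_files",      # illustrative path
    urls=["https://example.com"],    # illustrative URL
    crawler_depth=1,
    loading_wait_time=2,
)
file_paths = crawler.crawl()  # reuses the values passed above
```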

<a id="crawler.Crawler.crawl"></a>

#### Crawler.crawl

```python
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None) -> List[Path]
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None) -> List[Path]
```

Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -78,6 +81,9 @@ All URLs not matching at least one of the regular expressions will be dropped.
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
- `loading_wait_time`: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down the crawler.
E.g. 2: the crawler waits 2 seconds before scraping a page.

**Returns**:

@@ -88,7 +94,7 @@ List of paths where the crawled webpages got stored
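
The per-call argument overrides the value set at init. A sketch, reusing the `crawler` instance from the earlier example (URL illustrative):

```python
# Wait 3 seconds on this crawl only; crawler_depth=0 scrapes just the given URL.
paths = crawler.crawl(
    urls=["https://example.com/spa"],
    crawler_depth=0,
    loading_wait_time=3,
)
```
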
#### Crawler.run

```python
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
```

Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -110,6 +116,9 @@ not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.
In this case the id will be generated by using the content and the defined metadata.
- `extract_hidden_text`: Whether to extract the hidden text contained in the page.
E.g. the text can be inside a span with style="display: none"
- `loading_wait_time`: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down the crawler.
E.g. 2: the crawler waits 2 seconds before scraping a page.

**Returns**:

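Per the signature above, `run()` returns a tuple of the output dictionary and the name of the output edge. A sketch of calling it directly, the way a pipeline would (URL illustrative):

```python
# With return_documents=True the Document objects are returned under "documents";
# otherwise the crawled file paths are returned under "paths".
output, edge = crawler.run(
    urls=["https://example.com"],
    return_documents=True,
    loading_wait_time=2,
)
docs = output["documents"]
```
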
4 changes: 4 additions & 0 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
@@ -1989,6 +1989,10 @@
"extract_hidden_text": {
"title": "Extract Hidden Text",
"default": true
},
"loading_wait_time": {
"title": "Loading Wait Time",
"type": "integer"
}
},
"required": [
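This schema entry lets the new parameter pass validation in YAML pipeline definitions as well. A fragment of what that might look like (component name and values are illustrative):

```yaml
components:
  - name: DynamicPageCrawler
    type: Crawler
    params:
      output_dir: crawled_files
      loading_wait_time: 2
```
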
58 changes: 53 additions & 5 deletions haystack/nodes/connector/crawler.py
@@ -3,6 +3,7 @@
import re
import sys
import json
import time
import logging
from pathlib import Path
from urllib.parse import urlparse
@@ -11,6 +12,7 @@
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium import webdriver
except (ImportError, ModuleNotFoundError) as ie:
from haystack.utils.import_utils import _optional_component_not_installed
@@ -50,6 +52,7 @@ def __init__(
overwrite_existing_files=True,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
):
"""
Init object with basic params for crawling (can be overwritten later).
@@ -68,6 +71,9 @@
In this case the id will be generated by using the content and the defined metadata.
:param extract_hidden_text: Whether to extract the hidden text contained in the page.
E.g. the text can be inside a span with style="display: none"
:param loading_wait_time: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down the crawler.
E.g. 2: the crawler waits 2 seconds before scraping a page.
"""
super().__init__()

@@ -99,6 +105,7 @@ def __init__(
self.overwrite_existing_files = overwrite_existing_files
self.id_hash_keys = id_hash_keys
self.extract_hidden_text = extract_hidden_text
self.loading_wait_time = loading_wait_time

def crawl(
self,
@@ -109,6 +116,7 @@
overwrite_existing_files: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
loading_wait_time: Optional[int] = None,
) -> List[Path]:
"""
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -129,6 +137,9 @@
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param loading_wait_time: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down the crawler.
E.g. 2: the crawler waits 2 seconds before scraping a page.

:return: List of paths where the crawled webpages got stored
"""
@@ -147,6 +158,8 @@
crawler_depth = self.crawler_depth
if extract_hidden_text is None:
extract_hidden_text = self.extract_hidden_text
if loading_wait_time is None:
loading_wait_time = self.loading_wait_time

output_dir = Path(output_dir)
if not output_dir.exists():
@@ -165,18 +178,29 @@
for url in urls:
if pattern.search(url):
file_paths += self._write_to_files(
[url], output_dir=output_dir, extract_hidden_text=extract_hidden_text
[url],
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
else:
file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text)
file_paths += self._write_to_files(
urls,
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
# follow one level of sublinks if requested
if crawler_depth == 1:
sub_links: Dict[str, List] = {}
for url_ in urls:
already_found_links: List = list(sum(list(sub_links.values()), []))
sub_links[url_] = list(
self._extract_sublinks_from_url(
base_url=url_, filter_urls=filter_urls, already_found_links=already_found_links
base_url=url_,
filter_urls=filter_urls,
already_found_links=already_found_links,
loading_wait_time=loading_wait_time,
)
)
for url, extracted_sublink in sub_links.items():
Expand All @@ -186,6 +210,7 @@ def crawl(
base_url=url,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)

return file_paths
@@ -197,11 +222,14 @@ def _write_to_files(
extract_hidden_text: bool,
base_url: str = None,
id_hash_keys: Optional[List[str]] = None,
loading_wait_time: Optional[int] = None,
) -> List[Path]:
paths = []
for link in urls:
logger.info(f"writing contents from `{link}`")
self.driver.get(link)
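# When a wait is configured, give client-side JavaScript time to finish rendering before the body is read.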
if loading_wait_time is not None:
time.sleep(loading_wait_time)
el = self.driver.find_element(by=By.TAG_NAME, value="body")
if extract_hidden_text:
text = el.get_attribute("textContent")
@@ -234,6 +262,7 @@ def run(  # type: ignore
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
"""
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -253,6 +282,9 @@
In this case the id will be generated by using the content and the defined metadata.
:param extract_hidden_text: Whether to extract the hidden text contained in the page.
E.g. the text can be inside a span with style="display: none"
:param loading_wait_time: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down the crawler.
E.g. 2: the crawler waits 2 seconds before scraping a page.

:return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
"""
@@ -264,6 +296,7 @@
filter_urls=filter_urls,
overwrite_existing_files=overwrite_existing_files,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
results: Dict[str, Union[List[Document], List[Path]]] = {}
if return_documents:
@@ -287,6 +320,7 @@ def run_batch(  # type: ignore
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
):
return self.run(
output_dir=output_dir,
@@ -297,6 +331,7 @@
return_documents=return_documents,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)

@staticmethod
@@ -312,17 +347,30 @@ def _is_inpage_navigation(base_url: str, sub_link: str) -> bool:
return base_url_.path == sub_link_.path and base_url_.netloc == sub_link_.netloc

def _extract_sublinks_from_url(
self, base_url: str, filter_urls: Optional[List] = None, already_found_links: List = None
self,
base_url: str,
filter_urls: Optional[List] = None,
already_found_links: List = None,
loading_wait_time: Optional[int] = None,
) -> set:
if filter_urls:
filter_pattern = re.compile("|".join(filter_urls))

self.driver.get(base_url)
if loading_wait_time is not None:
time.sleep(loading_wait_time)
a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
sub_links = set()

for i in a_elements:
sub_link = i.get_attribute("href")
try:
sub_link = i.get_attribute("href")
except StaleElementReferenceException:
# The <a> element was removed or replaced by JavaScript between locating it and reading its href.
logger.error(
"The crawler couldn't find the link anymore. It has probably been removed from the DOM by JavaScript."
)
continue

if not (already_found_links and sub_link in already_found_links):
if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
30 changes: 28 additions & 2 deletions test/nodes/test_connector.py
@@ -1,11 +1,14 @@
from typing import List

import json
import time
from pathlib import Path

import pytest
from selenium.webdriver.common.by import By

from selenium.webdriver.common.by import By

from haystack.nodes.connector import Crawler
from haystack.schema import Document

@@ -24,6 +27,7 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
:param crawled_page: the output of Crawler (one element of the paths list)
"""
crawler.driver.get(url)

body = crawler.driver.find_element(by=By.TAG_NAME, value="body")

if crawler.extract_hidden_text:
@@ -36,7 +40,9 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
return page_data["content"] == expected_crawled_content


def content_in_results(crawler: Crawler, url: str, results: List[Path], expected_matches_count=1):
def content_in_results(
crawler: Crawler, url: str, results: List[Path], expected_matches_count=1, loading_wait_time: int = None
):
"""
Makes sure there is exactly one matching page in the list of pages returned
by the crawler.
@@ -134,7 +140,7 @@ def test_crawler_filter_urls(test_url, tmp_path):
paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
assert len(paths) == 1
assert content_match(crawler, test_url + "/page1.html", paths[0])
assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)
assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)


def test_crawler_return_document(test_url, tmp_path):
@@ -162,3 +168,23 @@ def test_crawler_extract_hidden_text(test_url, tmp_path):
)
crawled_content = documents["documents"][0].content
assert "hidden text" not in crawled_content


def test_crawler_loading_wait_time(test_url, tmp_path):
loading_wait_time = 3
crawler = Crawler(output_dir=tmp_path)
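# page_dynamic.html rewrites its own links ~150 ms after load, so the crawler has to wait to see the final DOM.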
paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time)

assert len(paths) == 4

with open(f"{SAMPLES_PATH.absolute()}/crawler/page_dynamic_result.txt", "r") as dynamic_result:
dynamic_result_text = dynamic_result.read()
for path in paths:
with open(path, "r") as crawled_file:
page_data = json.load(crawled_file)
if page_data["meta"]["url"] == test_url + "/page_dynamic.html":
assert dynamic_result_text == page_data["content"]

assert content_in_results(crawler, test_url + "/index.html", paths)
assert content_in_results(crawler, test_url + "/page1.html", paths)
assert content_in_results(crawler, test_url + "/page2.html", paths)
33 changes: 33 additions & 0 deletions test/samples/crawler/page_dynamic.html
@@ -0,0 +1,33 @@
<!DOCTYPE html>
<html>
<head>
<title>Test Dynamic Page</title>
</head>
<body>
<p>home page content</p>
<div id="content">
<a href="index.html" id="a1">link to index</a>
<a href="page_w_hidden_text.html" id="a2">link to page with hidden text</a>
<a href="page1.html" id="a3">link to page 1</a>
</div>
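<!-- After ~150 ms the script below removes the link to the hidden-text page and adds a link to page 2,
     so only a crawler that waits past the timeout sees the final set of links. -->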
<script>
const updateTimeout = setTimeout(myUpdateFunction, 150);

function myUpdateFunction() {
const remElem = document.querySelector('#a2');
if (remElem)
remElem.parentNode.removeChild(remElem);

if (!document.querySelector('#a4')) {
const newElem = document.createElement('a');
newElem.href = 'page2.html';
newElem.id = 'a4';
newElem.innerText = 'link to page 2';
document.body.appendChild(newElem);
}

clearTimeout(updateTimeout);
}
</script>
</body>
</html>
28 changes: 28 additions & 0 deletions test/samples/crawler/page_dynamic_result.txt
@@ -0,0 +1,28 @@

home page content

link to index

link to page 1


const updateTimeout = setTimeout(myUpdateFunction, 150);

function myUpdateFunction() {
const remElem = document.querySelector('#a2');
if (remElem)
remElem.parentNode.removeChild(remElem);

if (!document.querySelector('#a4')) {
const newElem = document.createElement('a');
newElem.href = 'page2.html';
newElem.id = 'a4';
newElem.innerText = 'link to page 2';
document.body.appendChild(newElem);
}

clearTimeout(updateTimeout);
}


link to page 2