Skip to content

Commit

Permalink
Fixed SharePoint connector polling (#3834)
Browse files Browse the repository at this point in the history
* Fixed SharePoint connector polling

* finish

* fix sharepoint connector
  • Loading branch information
hagen-danswer authored Jan 30, 2025
1 parent 95701db commit aabf8a9
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 17 deletions.
41 changes: 24 additions & 17 deletions backend/onyx/connectors/sharepoint/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,6 @@ def _fetch_driveitems(
start: datetime | None = None,
end: datetime | None = None,
) -> list[tuple[DriveItem, str]]:
filter_str = ""
if start is not None and end is not None:
filter_str = (
f"last_modified_datetime ge {start.isoformat()} and "
f"last_modified_datetime le {end.isoformat()}"
)

final_driveitems: list[tuple[DriveItem, str]] = []
try:
site = self.graph_client.sites.get_by_url(site_descriptor.url)
Expand Down Expand Up @@ -167,9 +160,10 @@ def _fetch_driveitems(
root_folder = root_folder.get_by_path(folder_part)

# Get all items recursively
query = root_folder.get_files(True, 1000)
if filter_str:
query = query.filter(filter_str)
query = root_folder.get_files(
recursive=True,
page_size=1000,
)
driveitems = query.execute_query()
logger.debug(
f"Found {len(driveitems)} items in drive '{drive.name}'"
Expand All @@ -180,11 +174,12 @@ def _fetch_driveitems(
"Shared Documents" if drive.name == "Documents" else drive.name
)

# Filter items based on folder path if specified
if site_descriptor.folder_path:
# Filter items to ensure they're in the specified folder or its subfolders
# The path will be in format: /drives/{drive_id}/root:/folder/path
filtered_driveitems = [
(item, drive_name)
driveitems = [
item
for item in driveitems
if any(
path_part == site_descriptor.folder_path
Expand All @@ -196,19 +191,31 @@ def _fetch_driveitems(
)[1].split("/")
)
]
if len(filtered_driveitems) == 0:
if len(driveitems) == 0:
all_paths = [
item.parent_reference.path for item in driveitems
]
logger.warning(
f"Nothing found for folder '{site_descriptor.folder_path}' "
f"in; any of valid paths: {all_paths}"
)
final_driveitems.extend(filtered_driveitems)
else:
final_driveitems.extend(
[(item, drive_name) for item in driveitems]

# Filter items based on time window if specified
if start is not None and end is not None:
driveitems = [
item
for item in driveitems
if start
<= item.last_modified_datetime.replace(tzinfo=timezone.utc)
<= end
]
logger.debug(
f"Found {len(driveitems)} items within time window in drive '{drive.name}'"
)

for item in driveitems:
final_driveitems.append((item, drive_name))

except Exception as e:
# Some drives might not be accessible
logger.warning(f"Failed to process drive: {str(e)}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,35 @@ def test_sharepoint_connector_other_library(
for expected in expected_documents:
doc = find_document(found_documents, expected.semantic_identifier)
verify_document_content(doc, expected)


def test_sharepoint_connector_poll(
    mock_get_unstructured_api_key: MagicMock,
    sharepoint_credentials: dict[str, str],
) -> None:
    """Poll the test site over a narrow time window and check that only the
    one document modified inside that window comes back."""
    # Point the connector at the base test site.
    connector = SharepointConnector(
        sites=["https://danswerai.sharepoint.com/sites/sharepoint-tests"]
    )
    connector.load_credentials(sharepoint_credentials)

    # test1.docx was last modified at 2025-01-28 20:51:42+00:00; bracket that
    # instant with a 20-second window (12 seconds before, 8 seconds after).
    window_start = datetime(2025, 1, 28, 20, 51, 30, tzinfo=timezone.utc)
    window_end = datetime(2025, 1, 28, 20, 51, 50, tzinfo=timezone.utc)

    # Flatten every batch yielded for the window into one document list.
    found_documents: list[Document] = []
    for batch in connector._fetch_from_sharepoint(start=window_start, end=window_end):
        found_documents.extend(batch)

    # Exactly one document falls inside the window, and it is test1.docx.
    assert len(found_documents) == 1, "Should only find one document in the time window"
    doc = found_documents[0]
    assert doc.semantic_identifier == "test1.docx"
    verify_document_metadata(doc)
    expected = [d for d in EXPECTED_DOCUMENTS if d.semantic_identifier == "test1.docx"][0]
    verify_document_content(doc, expected)

0 comments on commit aabf8a9

Please sign in to comment.