Skip to content

Commit

Permalink
Fixed SharePoint connector polling (#3834)
Browse files Browse the repository at this point in the history
* Fixed SharePoint connector polling

* finish

* fix sharepoint connector
  • Loading branch information
hagen-danswer authored Jan 30, 2025
1 parent 95701db commit aabf8a9
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 17 deletions.
41 changes: 24 additions & 17 deletions backend/onyx/connectors/sharepoint/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,6 @@ def _fetch_driveitems(
start: datetime | None = None,
end: datetime | None = None,
) -> list[tuple[DriveItem, str]]:
filter_str = ""
if start is not None and end is not None:
filter_str = (
f"last_modified_datetime ge {start.isoformat()} and "
f"last_modified_datetime le {end.isoformat()}"
)

final_driveitems: list[tuple[DriveItem, str]] = []
try:
site = self.graph_client.sites.get_by_url(site_descriptor.url)
Expand Down Expand Up @@ -167,9 +160,10 @@ def _fetch_driveitems(
root_folder = root_folder.get_by_path(folder_part)

# Get all items recursively
query = root_folder.get_files(True, 1000)
if filter_str:
query = query.filter(filter_str)
query = root_folder.get_files(
recursive=True,
page_size=1000,
)
driveitems = query.execute_query()
logger.debug(
f"Found {len(driveitems)} items in drive '{drive.name}'"
Expand All @@ -180,11 +174,12 @@ def _fetch_driveitems(
"Shared Documents" if drive.name == "Documents" else drive.name
)

# Filter items based on folder path if specified
if site_descriptor.folder_path:
# Filter items to ensure they're in the specified folder or its subfolders
# The path will be in format: /drives/{drive_id}/root:/folder/path
filtered_driveitems = [
(item, drive_name)
driveitems = [
item
for item in driveitems
if any(
path_part == site_descriptor.folder_path
Expand All @@ -196,19 +191,31 @@ def _fetch_driveitems(
)[1].split("/")
)
]
if len(filtered_driveitems) == 0:
if len(driveitems) == 0:
all_paths = [
item.parent_reference.path for item in driveitems
]
logger.warning(
f"Nothing found for folder '{site_descriptor.folder_path}' "
f"in; any of valid paths: {all_paths}"
)
final_driveitems.extend(filtered_driveitems)
else:
final_driveitems.extend(
[(item, drive_name) for item in driveitems]

# Filter items based on time window if specified
if start is not None and end is not None:
driveitems = [
item
for item in driveitems
if start
<= item.last_modified_datetime.replace(tzinfo=timezone.utc)
<= end
]
logger.debug(
f"Found {len(driveitems)} items within time window in drive '{drive.name}'"
)

for item in driveitems:
final_driveitems.append((item, drive_name))

except Exception as e:
# Some drives might not be accessible
logger.warning(f"Failed to process drive: {str(e)}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,35 @@ def test_sharepoint_connector_other_library(
for expected in expected_documents:
doc = find_document(found_documents, expected.semantic_identifier)
verify_document_content(doc, expected)


def test_sharepoint_connector_poll(
    mock_get_unstructured_api_key: MagicMock,
    sharepoint_credentials: dict[str, str],
) -> None:
    """Poll the test site over a narrow time window and check that only the
    one document modified inside that window comes back."""
    # Point the connector at the base test site.
    connector = SharepointConnector(
        sites=["https://danswerai.sharepoint.com/sites/sharepoint-tests"]
    )
    connector.load_credentials(sharepoint_credentials)

    # test1.docx was last modified at 2025-01-28 20:51:42+00:00; bracket that
    # instant with a 20-second window (12 seconds before, 8 seconds after).
    window_start = datetime(2025, 1, 28, 20, 51, 30, tzinfo=timezone.utc)
    window_end = datetime(2025, 1, 28, 20, 51, 50, tzinfo=timezone.utc)

    # Flatten every batch yielded for the window into one document list.
    found_documents: list[Document] = []
    for batch in connector._fetch_from_sharepoint(start=window_start, end=window_end):
        found_documents.extend(batch)

    # Exactly one document falls inside the window, and it is test1.docx.
    assert len(found_documents) == 1, "Should only find one document in the time window"
    doc = found_documents[0]
    assert doc.semantic_identifier == "test1.docx"
    verify_document_metadata(doc)
    expected = [d for d in EXPECTED_DOCUMENTS if d.semantic_identifier == "test1.docx"][0]
    verify_document_content(doc, expected)

0 comments on commit aabf8a9

Please sign in to comment.