Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed SharePoint connector polling #3834

Merged
merged 3 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 25 additions & 17 deletions backend/onyx/connectors/sharepoint/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,6 @@ def _fetch_driveitems(
start: datetime | None = None,
end: datetime | None = None,
) -> list[tuple[DriveItem, str]]:
filter_str = ""
if start is not None and end is not None:
filter_str = (
f"last_modified_datetime ge {start.isoformat()} and "
f"last_modified_datetime le {end.isoformat()}"
)

final_driveitems: list[tuple[DriveItem, str]] = []
try:
site = self.graph_client.sites.get_by_url(site_descriptor.url)
Expand Down Expand Up @@ -167,9 +160,10 @@ def _fetch_driveitems(
root_folder = root_folder.get_by_path(folder_part)

# Get all items recursively
query = root_folder.get_files(True, 1000)
if filter_str:
query = query.filter(filter_str)
query = root_folder.get_files(
recursive=True,
page_size=1000,
)
driveitems = query.execute_query()
logger.debug(
f"Found {len(driveitems)} items in drive '{drive.name}'"
Expand All @@ -180,11 +174,12 @@ def _fetch_driveitems(
"Shared Documents" if drive.name == "Documents" else drive.name
)

# Filter items based on folder path if specified
if site_descriptor.folder_path:
# Filter items to ensure they're in the specified folder or its subfolders
# The path will be in format: /drives/{drive_id}/root:/folder/path
filtered_driveitems = [
(item, drive_name)
driveitems = [
item
for item in driveitems
if any(
path_part == site_descriptor.folder_path
Expand All @@ -196,19 +191,32 @@ def _fetch_driveitems(
)[1].split("/")
)
]
if len(filtered_driveitems) == 0:
if len(driveitems) == 0:
all_paths = [
item.parent_reference.path for item in driveitems
]
logger.warning(
f"Nothing found for folder '{site_descriptor.folder_path}' "
f"in; any of valid paths: {all_paths}"
)
final_driveitems.extend(filtered_driveitems)
else:
final_driveitems.extend(
[(item, drive_name) for item in driveitems]

# Filter items based on time window if specified
if start is not None and end is not None:
driveitems = [
item
for item in driveitems
if start
<= item.last_modified_datetime.replace(tzinfo=timezone.utc)
<= end
]
logger.debug(
f"Found {len(driveitems)} items within time window in drive '{drive.name}'"
)

# Now fetch content only for items that passed all filters
for item in driveitems:
final_driveitems.append((item, drive_name))

except Exception as e:
# Some drives might not be accessible
logger.warning(f"Failed to process drive: {str(e)}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,35 @@ def test_sharepoint_connector_other_library(
for expected in expected_documents:
doc = find_document(found_documents, expected.semantic_identifier)
verify_document_content(doc, expected)


def test_sharepoint_connector_poll(
mock_get_unstructured_api_key: MagicMock,
sharepoint_credentials: dict[str, str],
) -> None:
# Initialize connector with the base site URL
connector = SharepointConnector(
sites=["https://danswerai.sharepoint.com/sites/sharepoint-tests"]
)

# Load credentials
connector.load_credentials(sharepoint_credentials)

# Set time window to only capture test1.docx (modified at 2025-01-28 20:51:42+00:00)
start = datetime(2025, 1, 28, 20, 51, 30, tzinfo=timezone.utc) # 12 seconds before
end = datetime(2025, 1, 28, 20, 51, 50, tzinfo=timezone.utc) # 8 seconds after

# Get documents within the time window
document_batches = list(connector._fetch_from_sharepoint(start=start, end=end))
found_documents: list[Document] = [
doc for batch in document_batches for doc in batch
]

# Should only find test1.docx
assert len(found_documents) == 1, "Should only find one document in the time window"
doc = found_documents[0]
assert doc.semantic_identifier == "test1.docx"
verify_document_metadata(doc)
verify_document_content(
doc, [d for d in EXPECTED_DOCUMENTS if d.semantic_identifier == "test1.docx"][0]
)
Loading