Skip to content

Commit

Permalink
feat(reports): execute as other than selenium user (#21931)
Browse files Browse the repository at this point in the history
Co-authored-by: Ville Brofeldt <ville.brofeldt@apple.com>
  • Loading branch information
villebro and villebro authored Oct 31, 2022
1 parent c9470ca commit a02a778
Show file tree
Hide file tree
Showing 14 changed files with 517 additions and 87 deletions.
28 changes: 27 additions & 1 deletion docs/docs/installation/alerts-reports.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -371,18 +371,44 @@ to specify on behalf of which username to render the dashboards. In general dash
are not accessible to unauthorized requests, that is why the worker needs to take over credentials
of an existing user to take a snapshot.

By default, Alerts and Reports are executed as the user that the `THUMBNAIL_SELENIUM_USER` config
parameter is set to. To change this user, just change the config as follows:

```python
THUMBNAIL_SELENIUM_USER = 'username_with_permission_to_access_dashboards'
```

In addition, it's also possible to execute the reports as the report owners/creators. This is typically
needed if there isn't a central service account that has access to all objects or databases (e.g.
when using user impersonation on database connections). For this there's the config flag
`ALERTS_REPORTS_EXECUTE_AS` which makes it possible to customize how alerts and reports are executed.
To first try to execute as the creator in the owners list (if present), then fall
back to the creator, then the last modifier in the owners list (if present), then the
last modifier, then an owner (giving priority to the last modifier and then the
creator if either is contained within the list of owners, otherwise the first owner
will be used) and finally `THUMBNAIL_SELENIUM_USER`, set as follows:

```python
from superset.reports.types import ReportScheduleExecutor

ALERT_REPORTS_EXECUTE_AS = [
ReportScheduleExecutor.CREATOR_OWNER,
ReportScheduleExecutor.CREATOR,
ReportScheduleExecutor.MODIFIER_OWNER,
ReportScheduleExecutor.MODIFIER,
ReportScheduleExecutor.OWNER,
ReportScheduleExecutor.SELENIUM,
]
```

**Important notes**

- Be mindful of the concurrency setting for celery (using `-c 4`). Selenium/webdriver instances can
consume a lot of CPU / memory on your servers.
- In some cases, if you notice a lot of leaked geckodriver processes, try running your celery
processes with `celery worker --pool=prefork --max-tasks-per-child=128 ...`
- It is recommended to run separate workers for the `sql_lab` and `email_reports` tasks. This can be
done using the `queue` field in `CELERY_ANNOTATIONS`.
done using the `queue` field in `task_annotations`.
- Adjust `WEBDRIVER_BASEURL` in your configuration file if celery workers can’t access Superset via
its default value of `http://0.0.0.0:8080/`.

Expand Down
19 changes: 19 additions & 0 deletions superset/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
from superset.advanced_data_type.types import AdvancedDataType
from superset.constants import CHANGE_ME_SECRET_KEY
from superset.jinja_context import BaseTemplateProcessor
from superset.reports.types import ReportScheduleExecutor
from superset.stats_logger import DummyStatsLogger
from superset.superset_typing import CacheConfig
from superset.utils.core import is_test, NO_TIME_RANGE, parse_boolean_string
Expand Down Expand Up @@ -1143,6 +1144,24 @@ def EMAIL_HEADER_MUTATOR( # pylint: disable=invalid-name,unused-argument
# sliding cron window size, should be synced with the celery beat config minus 1 second
ALERT_REPORTS_CRON_WINDOW_SIZE = 59
ALERT_REPORTS_WORKING_TIME_OUT_KILL = True
# Which user to attempt to execute Alerts/Reports as. By default,
# use the user defined in the `THUMBNAIL_SELENIUM_USER` config parameter.
# To first try to execute as the creator in the owners list (if present), then fall
# back to the creator, then the last modifier in the owners list (if present), then the
# last modifier, then an owner (giving priority to the last modifier and then the
# creator if either is contained within the list of owners, otherwise the first owner
# will be used) and finally `THUMBNAIL_SELENIUM_USER`, set as follows:
# ALERT_REPORTS_EXECUTE_AS = [
# ReportScheduleExecutor.CREATOR_OWNER,
# ReportScheduleExecutor.CREATOR,
# ReportScheduleExecutor.MODIFIER_OWNER,
# ReportScheduleExecutor.MODIFIER,
# ReportScheduleExecutor.OWNER,
# ReportScheduleExecutor.SELENIUM,
# ]
ALERT_REPORTS_EXECUTE_AS: List[ReportScheduleExecutor] = [
ReportScheduleExecutor.SELENIUM
]
# if ALERT_REPORTS_WORKING_TIME_OUT_KILL is True, set a celery hard timeout
# Equal to working timeout + ALERT_REPORTS_WORKING_TIME_OUT_LAG
ALERT_REPORTS_WORKING_TIME_OUT_LAG = int(timedelta(seconds=10).total_seconds())
Expand Down
4 changes: 2 additions & 2 deletions superset/reports/commands/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ class ReportScheduleNotificationError(CommandException):
message = _("Alert on grace period")


class ReportScheduleSelleniumUserNotFoundError(CommandException):
message = _("Report Schedule sellenium user not found")
class ReportScheduleUserNotFoundError(CommandException):
message = _("Report Schedule user not found")


class ReportScheduleStateNotFoundError(CommandException):
Expand Down
50 changes: 24 additions & 26 deletions superset/reports/commands/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,9 @@

import pandas as pd
from celery.exceptions import SoftTimeLimitExceeded
from flask_appbuilder.security.sqla.models import User
from sqlalchemy.orm import Session

from superset import app, security_manager
from superset import app
from superset.commands.base import BaseCommand
from superset.commands.exceptions import CommandException
from superset.common.chart_data import ChartDataResultFormat, ChartDataResultType
Expand All @@ -45,7 +44,6 @@
ReportSchedulePreviousWorkingError,
ReportScheduleScreenshotFailedError,
ReportScheduleScreenshotTimeout,
ReportScheduleSelleniumUserNotFoundError,
ReportScheduleStateNotFoundError,
ReportScheduleUnexpectedError,
ReportScheduleWorkingTimeoutError,
Expand All @@ -67,6 +65,7 @@
from superset.reports.notifications import create_notification
from superset.reports.notifications.base import NotificationContent
from superset.reports.notifications.exceptions import NotificationError
from superset.reports.utils import get_executor
from superset.utils.celery import session_scope
from superset.utils.core import HeaderDataType, override_user
from superset.utils.csv import get_chart_csv_data, get_chart_dataframe
Expand All @@ -77,13 +76,6 @@
logger = logging.getLogger(__name__)


def _get_user() -> User:
user = security_manager.find_user(username=app.config["THUMBNAIL_SELENIUM_USER"])
if not user:
raise ReportScheduleSelleniumUserNotFoundError()
return user


class BaseReportState:
current_states: List[ReportState] = []
initial: bool = False
Expand Down Expand Up @@ -182,11 +174,11 @@ def _get_url(
**kwargs,
)

# If we need to render dashboard in a specific sate, use stateful permalink
# If we need to render dashboard in a specific state, use stateful permalink
dashboard_state = self._report_schedule.extra.get("dashboard")
if dashboard_state:
permalink_key = CreateDashboardPermalinkCommand(
dashboard_id=self._report_schedule.dashboard_id,
dashboard_id=str(self._report_schedule.dashboard_id),
state=dashboard_state,
).run()
return get_url_path("Superset.dashboard_permalink", key=permalink_key)
Expand All @@ -206,7 +198,7 @@ def _get_screenshots(self) -> List[bytes]:
:raises: ReportScheduleScreenshotFailedError
"""
url = self._get_url()
user = _get_user()
user = get_executor(self._report_schedule)
if self._report_schedule.chart:
screenshot: Union[ChartScreenshot, DashboardScreenshot] = ChartScreenshot(
url,
Expand Down Expand Up @@ -236,16 +228,15 @@ def _get_screenshots(self) -> List[bytes]:

def _get_csv_data(self) -> bytes:
url = self._get_url(result_format=ChartDataResultFormat.CSV)
auth_cookies = machine_auth_provider_factory.instance.get_auth_cookies(
_get_user()
)
user = get_executor(self._report_schedule)
auth_cookies = machine_auth_provider_factory.instance.get_auth_cookies(user)

if self._report_schedule.chart.query_context is None:
logger.warning("No query context found, taking a screenshot to generate it")
self._update_query_context()

try:
logger.info("Getting chart from %s", url)
logger.info("Getting chart from %s as user %s", url, user.username)
csv_data = get_chart_csv_data(url, auth_cookies)
except SoftTimeLimitExceeded as ex:
raise ReportScheduleCsvTimeout() from ex
Expand All @@ -262,16 +253,15 @@ def _get_embedded_data(self) -> pd.DataFrame:
Return data as a Pandas dataframe, to embed in notifications as a table.
"""
url = self._get_url(result_format=ChartDataResultFormat.JSON)
auth_cookies = machine_auth_provider_factory.instance.get_auth_cookies(
_get_user()
)
user = get_executor(self._report_schedule)
auth_cookies = machine_auth_provider_factory.instance.get_auth_cookies(user)

if self._report_schedule.chart.query_context is None:
logger.warning("No query context found, taking a screenshot to generate it")
self._update_query_context()

try:
logger.info("Getting chart from %s", url)
logger.info("Getting chart from %s as user %s", url, user.username)
dataframe = get_chart_dataframe(url, auth_cookies)
except SoftTimeLimitExceeded as ex:
raise ReportScheduleDataFrameTimeout() from ex
Expand Down Expand Up @@ -674,10 +664,16 @@ def __init__(self, task_id: str, model_id: int, scheduled_dttm: datetime):
def run(self) -> None:
with session_scope(nullpool=True) as session:
try:
with override_user(_get_user()):
self.validate(session=session)
if not self._model:
raise ReportScheduleExecuteUnexpectedError()
self.validate(session=session)
if not self._model:
raise ReportScheduleExecuteUnexpectedError()
user = get_executor(self._model)
with override_user(user):
logger.info(
"Running report schedule %s as user %s",
self._execution_id,
user.username,
)
ReportScheduleStateMachine(
session, self._execution_id, self._model, self._scheduled_dttm
).run()
Expand All @@ -695,6 +691,8 @@ def validate( # pylint: disable=arguments-differ
self._model_id,
self._execution_id,
)
self._model = ReportScheduleDAO.find_by_id(self._model_id, session=session)
self._model = (
session.query(ReportSchedule).filter_by(id=self._model_id).one_or_none()
)
if not self._model:
raise ReportScheduleNotFoundError()
10 changes: 10 additions & 0 deletions superset/reports/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,20 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from enum import Enum
from typing import TypedDict

from superset.dashboards.permalink.types import DashboardPermalinkState


class ReportScheduleExtra(TypedDict):
dashboard: DashboardPermalinkState


class ReportScheduleExecutor(str, Enum):
SELENIUM = "selenium"
CREATOR = "creator"
CREATOR_OWNER = "creator_owner"
MODIFIER = "modifier"
MODIFIER_OWNER = "modifier_owner"
OWNER = "owner"
71 changes: 71 additions & 0 deletions superset/reports/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from flask_appbuilder.security.sqla.models import User

from superset import app, security_manager
from superset.reports.commands.exceptions import ReportScheduleUserNotFoundError
from superset.reports.models import ReportSchedule
from superset.reports.types import ReportScheduleExecutor


# pylint: disable=too-many-branches
def get_executor(report_schedule: ReportSchedule) -> User:
"""
Extract the user that should be used to execute a report schedule as.
:param report_schedule: The report to execute
:return: User to execute the report as
"""
user_types = app.config["ALERT_REPORTS_EXECUTE_AS"]
owners = report_schedule.owners
owner_dict = {owner.id: owner for owner in owners}
for user_type in user_types:
if user_type == ReportScheduleExecutor.SELENIUM:
username = app.config["THUMBNAIL_SELENIUM_USER"]
if username and (user := security_manager.find_user(username=username)):
return user
if user_type == ReportScheduleExecutor.CREATOR_OWNER:
if (user := report_schedule.created_by) and (
owner := owner_dict.get(user.id)
):
return owner
if user_type == ReportScheduleExecutor.CREATOR:
if user := report_schedule.created_by:
return user
if user_type == ReportScheduleExecutor.MODIFIER_OWNER:
if (user := report_schedule.changed_by) and (
owner := owner_dict.get(user.id)
):
return owner
if user_type == ReportScheduleExecutor.MODIFIER:
if user := report_schedule.changed_by:
return user
if user_type == ReportScheduleExecutor.OWNER:
owners = report_schedule.owners
if len(owners) == 1:
return owners[0]
if len(owners) > 1:
if modifier := report_schedule.changed_by:
if modifier and (user := owner_dict.get(modifier.id)):
return user
if creator := report_schedule.created_by:
if creator and (user := owner_dict.get(creator.id)):
return user
return owners[0]

raise ReportScheduleUserNotFoundError()
8 changes: 5 additions & 3 deletions superset/utils/machine_auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

import importlib
import logging
from typing import Callable, Dict, TYPE_CHECKING
Expand All @@ -34,7 +36,7 @@

class MachineAuthProvider:
def __init__(
self, auth_webdriver_func_override: Callable[[WebDriver, "User"], WebDriver]
self, auth_webdriver_func_override: Callable[[WebDriver, User], WebDriver]
):
# This is here in order to allow for the authenticate_webdriver func to be
# overridden via config, as opposed to the entire provider implementation
Expand All @@ -43,7 +45,7 @@ def __init__(
def authenticate_webdriver(
self,
driver: WebDriver,
user: "User",
user: User,
) -> WebDriver:
"""
Default AuthDriverFuncType type that sets a session cookie flask-login style
Expand All @@ -69,7 +71,7 @@ def authenticate_webdriver(
return driver

@staticmethod
def get_auth_cookies(user: "User") -> Dict[str, str]:
def get_auth_cookies(user: User) -> Dict[str, str]:
# Login with the user specified to get the reports
with current_app.test_request_context("/login"):
login_user(user)
Expand Down
Loading

0 comments on commit a02a778

Please sign in to comment.