Skip to content

Commit

Permalink
perf: refactor SIP-68 db migrations with INSERT SELECT FROM (apache#1…
Browse files Browse the repository at this point in the history
  • Loading branch information
ktmud authored May 11, 2022
1 parent 57e3237 commit 30b5a5d
Show file tree
Hide file tree
Showing 30 changed files with 2,337 additions and 1,969 deletions.
72 changes: 40 additions & 32 deletions superset/columns/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
These models are not fully implemented, and shouldn't be used yet.
"""

import sqlalchemy as sa
from flask_appbuilder import Model

Expand All @@ -33,6 +32,8 @@
ImportExportMixin,
)

UNKOWN_TYPE = "UNKNOWN"


class Column(
Model,
Expand All @@ -52,51 +53,58 @@ class Column(

id = sa.Column(sa.Integer, primary_key=True)

# Assuming the column is an aggregation, is it additive? Useful for determining which
# aggregations can be done on the metric. Eg, ``COUNT(DISTINCT user_id)`` is not
# additive, so it shouldn't be used in a ``SUM``.
is_additive = sa.Column(sa.Boolean, default=False)

# Is this column an aggregation (metric)?
is_aggregation = sa.Column(sa.Boolean, default=False)

is_filterable = sa.Column(sa.Boolean, nullable=False, default=True)
is_dimensional = sa.Column(sa.Boolean, nullable=False, default=False)

# Is an increase desired? Useful for displaying the results of A/B tests, or setting
# up alerts. Eg, this is true for "revenue", but false for "latency".
is_increase_desired = sa.Column(sa.Boolean, default=True)

# Column is managed externally and should be read-only inside Superset
is_managed_externally = sa.Column(sa.Boolean, nullable=False, default=False)

# Is this column a partition? Useful for scheduling queries and previewing the latest
# data.
is_partition = sa.Column(sa.Boolean, default=False)

# Does the expression point directly to a physical column?
is_physical = sa.Column(sa.Boolean, default=True)

# Is this a spatial column? This could be leveraged in the future for spatial
# visualizations.
is_spatial = sa.Column(sa.Boolean, default=False)

# Is this a time column? Useful for plotting time series.
is_temporal = sa.Column(sa.Boolean, default=False)

# We use ``sa.Text`` for these attributes because (1) in modern databases the
# performance is the same as ``VARCHAR``[1] and (2) some table names can be
# **really** long (e.g., Google Sheets URLs).
#
# [1] https://www.postgresql.org/docs/9.1/datatype-character.html
name = sa.Column(sa.Text)
type = sa.Column(sa.Text)
# Raw type as returned and used by db engine.
type = sa.Column(sa.Text, default=UNKOWN_TYPE)

# Columns are defined by expressions. For tables, these are the actual column names,
# and should match the ``name`` attribute. For datasets, these can be any valid SQL
# expression. If the SQL expression is an aggregation the column is a metric,
# otherwise it's a computed column.
expression = sa.Column(sa.Text)

# Does the expression point directly to a physical column?
is_physical = sa.Column(sa.Boolean, default=True)
unit = sa.Column(sa.Text)

# Additional metadata describing the column.
description = sa.Column(sa.Text)
warning_text = sa.Column(sa.Text)
unit = sa.Column(sa.Text)

# Is this a time column? Useful for plotting time series.
is_temporal = sa.Column(sa.Boolean, default=False)

# Is this a spatial column? This could be leveraged in the future for spatial
# visualizations.
is_spatial = sa.Column(sa.Boolean, default=False)

# Is this column a partition? Useful for scheduling queries and previewing the latest
# data.
is_partition = sa.Column(sa.Boolean, default=False)

# Is this column an aggregation (metric)?
is_aggregation = sa.Column(sa.Boolean, default=False)

# Assuming the column is an aggregation, is it additive? Useful for determining which
# aggregations can be done on the metric. Eg, ``COUNT(DISTINCT user_id)`` is not
# additive, so it shouldn't be used in a ``SUM``.
is_additive = sa.Column(sa.Boolean, default=False)

# Is an increase desired? Useful for displaying the results of A/B tests, or setting
# up alerts. Eg, this is true for "revenue", but false for "latency".
is_increase_desired = sa.Column(sa.Boolean, default=True)

# Column is managed externally and should be read-only inside Superset
is_managed_externally = sa.Column(sa.Boolean, nullable=False, default=False)
external_url = sa.Column(sa.Text, nullable=True)

def __repr__(self) -> str:
    """Return a short debugging representation that shows this column's ``id``."""
    column_id = self.id
    return f"<Column id={column_id}>"
6 changes: 3 additions & 3 deletions superset/connectors/base/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from superset.models.slice import Slice
from superset.superset_typing import FilterValue, FilterValues, QueryObjectDict
from superset.utils import core as utils
from superset.utils.core import GenericDataType
from superset.utils.core import GenericDataType, MediumText

METRIC_FORM_DATA_PARAMS = [
"metric",
Expand Down Expand Up @@ -586,7 +586,7 @@ class BaseColumn(AuditMixinNullable, ImportExportMixin):
type = Column(Text)
groupby = Column(Boolean, default=True)
filterable = Column(Boolean, default=True)
description = Column(Text)
description = Column(MediumText())
is_dttm = None

# [optional] Set this to support import/export functionality
Expand Down Expand Up @@ -672,7 +672,7 @@ class BaseMetric(AuditMixinNullable, ImportExportMixin):
metric_name = Column(String(255), nullable=False)
verbose_name = Column(String(1024))
metric_type = Column(String(32))
description = Column(Text)
description = Column(MediumText())
d3format = Column(String(128))
warning_text = Column(Text)

Expand Down
Loading

0 comments on commit 30b5a5d

Please sign in to comment.